diff --git a/.github/config.yml b/.github/config.yml
new file mode 100644
index 0000000000..9fedbd1221
--- /dev/null
+++ b/.github/config.yml
@@ -0,0 +1,26 @@
+# Comment to be posted on first-time issues
+newIssueWelcomeComment: >
+ [](https://zenodo.org/record/3695300)
+
+ :tada: Welcome to _PyMC_! :tada:
+ We're really excited to have your input into the project! :sparkling_heart:
+
+
+ If you haven't done so already, please make sure you check out our [Contributing Guidelines](https://www.pymc.io/projects/docs/en/latest/contributing/index.html) and [Code of Conduct](https://github.com/pymc-devs/pymc/blob/main/CODE_OF_CONDUCT.md).
+
+
+# Comment to be posted on PRs from first-time contributors in your repository
+newPRWelcomeComment: >
+ [](https://zenodo.org/record/3695300)
+
+ :sparkling_heart: Thanks for opening this pull request! :sparkling_heart:
+ The _PyMC_ community really appreciates your time and effort to contribute to the project.
+ Please make sure you have read our [Contributing Guidelines](https://www.pymc.io/projects/docs/en/latest/contributing/index.html) and filled in our pull request template to the best of your ability.
+
+
+# Comment to be posted on pull requests merged by a first-time user
+firstPRMergeComment: >
+ [](https://zenodo.org/record/3695300)
+
+ Congrats on merging your first pull request! :tada:
+ We here at _PyMC_ are proud of you! :sparkling_heart:
+ Thank you so much for your contribution :gift:
diff --git a/.github/release.yml b/.github/release.yml
index af6e33d08c..1efe251d74 100644
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -17,6 +17,7 @@ changelog:
labels:
- bug
- title: Documentation 📖
+ labels:
- docs
- title: Maintenance 🔧
labels:
diff --git a/.github/workflows/rtd-link-preview.yml b/.github/workflows/rtd-link-preview.yml
new file mode 100644
index 0000000000..626b410c38
--- /dev/null
+++ b/.github/workflows/rtd-link-preview.yml
@@ -0,0 +1,16 @@
+name: Read the Docs Pull Request Preview
+on:
+ pull_request_target:
+ types:
+ - opened
+
+permissions:
+ pull-requests: write
+
+jobs:
+ documentation-links:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: readthedocs/actions/preview@v1
+ with:
+ project-slug: "pymc"
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 670f5895c2..d0bd850ca8 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,23 +2,10 @@ name: tests
on:
pull_request:
- paths:
- - ".github/workflows/tests.yml"
- - "pymc/**.py"
- - "*.py"
- - "conda-envs/**"
- - "codecov.yml"
- - "requirements*.txt"
- - "scripts/*.sh"
push:
- branches: [main]
- paths:
- - ".github/workflows/tests.yml"
- - "pymc/**.py"
- - "*.py"
- - "conda-envs/**"
- - "codecov.yml"
- - "scripts/*.sh"
+ branches:
+ - main
+
# Tests are split into multiple jobs to accelerate the CI.
# Different jobs should be organized to take approximately the same
@@ -30,12 +17,38 @@ on:
# enforces that test run just once per OS / floatX setting.
jobs:
+
+ changes:
+ name: "Check for changes"
+ runs-on: ubuntu-latest
+ outputs:
+ changes: ${{ steps.changes.outputs.src }}
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - uses: dorny/paths-filter@v2
+ id: changes
+ with:
+ filters: |
+ src:
+ - ".github/workflows/tests.yml"
+ - "pymc/**/*.py"
+ - "tests/**/*.py"
+ - "*.py"
+ - "conda-envs/*"
+ - "requirements*.txt"
+ - "codecov.yml"
+ - "scripts/*.sh"
+
ubuntu:
+ needs: changes
+ if: ${{ needs.changes.outputs.changes == 'true' }}
strategy:
matrix:
os: [ubuntu-20.04]
floatx: [float64]
- python-version: ["3.10"]
+ python-version: ["3.11"]
test-subset:
- |
tests/test_util.py
@@ -47,6 +60,7 @@ jobs:
tests/test_func_utils.py
tests/distributions/test_shape_utils.py
tests/distributions/test_mixture.py
+ tests/test_testing.py
- |
tests/distributions/test_continuous.py
@@ -75,6 +89,7 @@ jobs:
- |
tests/distributions/test_timeseries.py
tests/gp/test_cov.py
+ tests/gp/test_hsgp_approx.py
tests/gp/test_gp.py
tests/gp/test_mean.py
tests/gp/test_util.py
@@ -88,10 +103,10 @@ jobs:
tests/backends/test_mcbackend.py
tests/distributions/test_truncated.py
tests/logprob/test_abstract.py
+ tests/logprob/test_basic.py
tests/logprob/test_censoring.py
tests/logprob/test_composite_logprob.py
tests/logprob/test_cumsum.py
- tests/logprob/test_joint_logprob.py
tests/logprob/test_mixture.py
tests/logprob/test_rewriting.py
tests/logprob/test_scan.py
@@ -158,7 +173,10 @@ jobs:
env_vars: TEST_SUBSET
name: ${{ matrix.os }} ${{ matrix.floatx }}
fail_ci_if_error: false
+
windows:
+ needs: changes
+ if: ${{ needs.changes.outputs.changes == 'true' }}
strategy:
matrix:
os: [windows-latest]
@@ -231,7 +249,10 @@ jobs:
env_vars: TEST_SUBSET
name: ${{ matrix.os }} ${{ matrix.floatx }}
fail_ci_if_error: false
+
macos:
+ needs: changes
+ if: ${{ needs.changes.outputs.changes == 'true' }}
strategy:
matrix:
os: [macos-latest]
@@ -307,12 +328,15 @@ jobs:
env_vars: TEST_SUBSET
name: ${{ matrix.os }} ${{ matrix.floatx }}
fail_ci_if_error: false
+
external_samplers:
+ needs: changes
+ if: ${{ needs.changes.outputs.changes == 'true' }}
strategy:
matrix:
os: [ubuntu-20.04]
floatx: [float64]
- python-version: ["3.9"]
+ python-version: ["3.10"]
test-subset:
- tests/sampling/test_jax.py tests/sampling/test_mcmc_external.py
fail-fast: false
@@ -377,12 +401,15 @@ jobs:
env_vars: TEST_SUBSET
name: JAX tests - ${{ matrix.os }} ${{ matrix.floatx }}
fail_ci_if_error: false
+
float32:
+ needs: changes
+ if: ${{ needs.changes.outputs.changes == 'true' }}
strategy:
matrix:
os: [windows-latest]
floatx: [float32]
- python-version: ["3.10"]
+ python-version: ["3.11"]
test-subset:
- tests/sampling/test_mcmc.py tests/ode/test_ode.py tests/ode/test_utils.py
fail-fast: false
@@ -446,3 +473,17 @@ jobs:
env_vars: TEST_SUBSET
name: ${{ matrix.os }} ${{ matrix.floatx }}
fail_ci_if_error: false
+
+ all_tests:
+ if: ${{ always() }}
+ runs-on: ubuntu-latest
+ needs: [ changes, ubuntu, windows, macos, external_samplers, float32 ]
+ steps:
+ - name: Check build matrix status
+ if: ${{ needs.changes.outputs.changes == 'true' &&
+ ( needs.ubuntu.result != 'success' ||
+ needs.windows.result != 'success' ||
+ needs.macos.result != 'success' ||
+ needs.external_samplers.result != 'success' ||
+ needs.float32.result != 'success' ) }}
+ run: exit 1
diff --git a/.gitpod.yml b/.gitpod.yml
index 2fe0a5274c..9420f6b574 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -22,17 +22,14 @@ tasks:
# Install dependencies
sudo chown "$(id -u):$(id -g)" /opt/conda/conda-meta/history
- micromamba install --yes --name base --file conda-envs/environment-dev.yml
-
- # Install PyMC
- pip install -e .
+ (micromamba install --yes --name base --file conda-envs/environment-dev.yml; pip install -e .) &> /tmp/install-init.log &
command: |
# Reinitialize devcontainer for good measure
_dev-init.sh
# Install the pre-commit hooks in the background if not already installed
- pre-commit install --install-hooks
+ pre-commit install --install-hooks &> /tmp/pre-commit-init-output.log &
vscode:
extensions:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 632b62fccc..43124ee7a7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,12 +34,12 @@ repos:
- id: pyupgrade
args: [--py37-plus]
- repo: https://github.com/psf/black
- rev: 23.1.0
+ rev: 23.3.0
hooks:
- id: black
- id: black-jupyter
- repo: https://github.com/PyCQA/pylint
- rev: v2.16.2
+ rev: v3.0.0a6
hooks:
- id: pylint
args: [--rcfile=.pylintrc]
diff --git a/MANIFEST.in b/MANIFEST.in
index cad372f492..e0847b1a38 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,7 +1,3 @@
-recursive-include tests/data *
-recursive-include source *
-# because of an upload-size limit by PyPI, we're temporarily removing docs from the tarball:
-recursive-exclude docs *
include requirements.txt
include *.md *.rst
include scripts/*.sh
diff --git a/README.rst b/README.rst
index ce7e514f4e..182e52a498 100644
--- a/README.rst
+++ b/README.rst
@@ -151,6 +151,8 @@ Sponsors
|PyMCLabs|
+|Mistplay|
+
.. |Binder| image:: https://mybinder.org/badge_logo.svg
:target: https://mybinder.org/v2/gh/pymc-devs/pymc/main?filepath=%2Fdocs%2Fsource%2Fnotebooks
.. |Build Status| image:: https://github.com/pymc-devs/pymc/workflows/pytest/badge.svg
@@ -159,9 +161,11 @@ Sponsors
:target: https://codecov.io/gh/pymc-devs/pymc
.. |Dockerhub| image:: https://img.shields.io/docker/automated/pymc/pymc.svg
:target: https://hub.docker.com/r/pymc/pymc
-.. |NumFOCUS| image:: https://www.numfocus.org/wp-content/uploads/2017/03/1457562110.png
- :target: http://www.numfocus.org/
.. |NumFOCUS_badge| image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A
:target: http://www.numfocus.org/
-.. |PyMCLabs| image:: https://raw.githubusercontent.com/pymc-devs/pymc/main/docs/logos/sponsors/pymc-labs.png
+.. |NumFOCUS| image:: https://github.com/pymc-devs/brand/blob/main/sponsors/sponsor_logos/sponsor_numfocus.png?raw=true
+ :target: http://www.numfocus.org/
+.. |PyMCLabs| image:: https://github.com/pymc-devs/brand/blob/main/sponsors/sponsor_logos/sponsor_pymc_labs.png?raw=true
:target: https://pymc-labs.io
+.. |Mistplay| image:: https://github.com/pymc-devs/brand/blob/main/sponsors/sponsor_logos/sponsor_mistplay.png?raw=true
+ :target: https://www.mistplay.com/
diff --git a/benchmarks/benchmarks/benchmarks.py b/benchmarks/benchmarks/benchmarks.py
index 1033b638b2..0f1cd0f2f1 100644
--- a/benchmarks/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks/benchmarks.py
@@ -18,7 +18,7 @@
import numpy as np
import pandas as pd
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pymc as pm
@@ -61,8 +61,8 @@ def mixture_model(random_seed=1234):
mu = pm.Normal("mu", mu=0.0, sigma=10.0, shape=w_true.shape)
enforce_order = pm.Potential(
"enforce_order",
- at.switch(mu[0] - mu[1] <= 0, 0.0, -np.inf)
- + at.switch(mu[1] - mu[2] <= 0, 0.0, -np.inf),
+ pt.switch(mu[0] - mu[1] <= 0, 0.0, -np.inf)
+ + pt.switch(mu[1] - mu[2] <= 0, 0.0, -np.inf),
)
tau = pm.Gamma("tau", alpha=1.0, beta=1.0, shape=w_true.shape)
pm.NormalMixture("x_obs", w=w, mu=mu, tau=tau, observed=x)
diff --git a/conda-envs/environment-dev.yml b/conda-envs/environment-dev.yml
index 2a1fb83894..a9680c8703 100644
--- a/conda-envs/environment-dev.yml
+++ b/conda-envs/environment-dev.yml
@@ -14,7 +14,7 @@ dependencies:
- numpy>=1.15.0
- pandas>=0.24.0
- pip
-- pytensor=2.10.1
+- pytensor>=2.11.0,<2.12
- python-graphviz
- networkx
- scipy>=1.4.1
diff --git a/conda-envs/environment-docs.yml b/conda-envs/environment-docs.yml
index e00978a0fe..cf21386681 100644
--- a/conda-envs/environment-docs.yml
+++ b/conda-envs/environment-docs.yml
@@ -12,7 +12,7 @@ dependencies:
- numpy>=1.15.0
- pandas>=0.24.0
- pip
-- pytensor=2.9.1
+- pytensor>=2.11.0,<2.12
- python-graphviz
- scipy>=1.4.1
- typing-extensions>=3.7.4
@@ -23,13 +23,13 @@ dependencies:
- myst-nb
- numpydoc
- pre-commit>=2.8.0
+- pymc-sphinx-theme==0.13
- sphinx-copybutton
- sphinx-design
- sphinx-notfound-page
-- sphinx>=1.5
+- sphinx>=5
- sphinxext-rediraffe
- watermark
- sphinx-remove-toctrees
- pip:
- - git+https://github.com/pymc-devs/pymc-sphinx-theme
- numdifftools>=0.9.40
diff --git a/conda-envs/environment-test.yml b/conda-envs/environment-test.yml
index 5ea181b329..9d7427ea27 100644
--- a/conda-envs/environment-test.yml
+++ b/conda-envs/environment-test.yml
@@ -17,7 +17,7 @@ dependencies:
- numpy>=1.15.0
- pandas>=0.24.0
- pip
-- pytensor=2.10.1
+- pytensor>=2.11.0,<2.12
- python-graphviz
- networkx
- scipy>=1.4.1
diff --git a/conda-envs/windows-environment-dev.yml b/conda-envs/windows-environment-dev.yml
index f7ad2ca141..82dc168cde 100644
--- a/conda-envs/windows-environment-dev.yml
+++ b/conda-envs/windows-environment-dev.yml
@@ -14,7 +14,7 @@ dependencies:
- numpy>=1.15.0
- pandas>=0.24.0
- pip
-- pytensor=2.10.1
+- pytensor>=2.11.0,<2.12
- python-graphviz
- networkx
- scipy>=1.4.1
diff --git a/conda-envs/windows-environment-test.yml b/conda-envs/windows-environment-test.yml
index 708467a066..6cdeb60947 100644
--- a/conda-envs/windows-environment-test.yml
+++ b/conda-envs/windows-environment-test.yml
@@ -17,7 +17,7 @@ dependencies:
- numpy>=1.15.0
- pandas>=0.24.0
- pip
-- pytensor=2.10.1
+- pytensor>=2.11.0,<2.12
- python-graphviz
- networkx
- scipy>=1.4.1
diff --git a/docs/source/PyMC_and_PyTensor.rst b/docs/source/PyMC_and_PyTensor.rst
index 28ffbb5427..a6c06cc22d 100644
--- a/docs/source/PyMC_and_PyTensor.rst
+++ b/docs/source/PyMC_and_PyTensor.rst
@@ -34,13 +34,13 @@ First, we need to define symbolic variables for our inputs (this
is similar to eg SymPy's `Symbol`)::
import pytensor
- import pytensor.tensor as at
+ import pytensor.tensor as pt
# We don't specify the dtype of our input variables, so it
# defaults to using float64 without any special config.
- a = at.scalar('a')
- x = at.vector('x')
- # `at.ivector` creates a symbolic vector of integers.
- y = at.ivector('y')
+ a = pt.scalar('a')
+ x = pt.vector('x')
+ # `pt.ivector` creates a symbolic vector of integers.
+ y = pt.ivector('y')
Next, we use those variables to build up a symbolic representation
of the output of our function. Note that no computation is actually
@@ -48,11 +48,11 @@ being done at this point. We only record what operations we need to
do to compute the output::
inner = a * x**3 + y**2
- out = at.exp(inner).sum()
+ out = pt.exp(inner).sum()
.. note::
- In this example we use `at.exp` to create a symbolic representation
+ In this example we use `pt.exp` to create a symbolic representation
of the exponential of `inner`. Somewhat surprisingly, it
would also have worked if we used `np.exp`. This is because numpy
gives objects it operates on a chance to define the results of
@@ -77,8 +77,8 @@ We can call this function with actual arrays as many times as we want::
For the most part the symbolic PyTensor variables can be operated on
like NumPy arrays. Most NumPy functions are available in `pytensor.tensor`
-(which is typically imported as `at`). A lot of linear algebra operations
-can be found in `at.nlinalg` and `at.slinalg` (the NumPy and SciPy
+(which is typically imported as `pt`). A lot of linear algebra operations
+can be found in `pt.nlinalg` and `pt.slinalg` (the NumPy and SciPy
operations respectively). Some support for sparse matrices is available
in `pytensor.sparse`. For a detailed overview of available operations,
see :mod:`the pytensor api docs `.
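+
+As a quick illustration (a minimal sketch, assuming the `pt.nlinalg` and
+`pt.slinalg` submodules mentioned above), a symbolic linear solve and a
+determinant look just like their SciPy and NumPy counterparts::
+
+    A = pt.matrix('A')
+    b = pt.vector('b')
+    # Symbolic solution of the linear system A @ x = b
+    x = pt.slinalg.solve(A, b)
+    # Symbolic determinant of A
+    d = pt.nlinalg.det(A)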
@@ -88,9 +88,9 @@ NumPy arrays are operations involving conditional execution.
Code like this won't work as expected::
- a = at.vector('a')
+ a = pt.vector('a')
if (a > 0).all():
- b = at.sqrt(a)
+ b = pt.sqrt(a)
else:
b = -a
@@ -100,17 +100,17 @@ and according to the rules for this conversion, things that aren't empty
containers or zero are converted to `True`. So the code is equivalent
to this::
- a = at.vector('a')
- b = at.sqrt(a)
+ a = pt.vector('a')
+ b = pt.sqrt(a)
-To get the desired behaviour, we can use `at.switch`::
+To get the desired behaviour, we can use `pt.switch`::
- a = at.vector('a')
- b = at.switch((a > 0).all(), at.sqrt(a), -a)
+ a = pt.vector('a')
+ b = pt.switch((a > 0).all(), pt.sqrt(a), -a)
Indexing also works similarly to NumPy::
- a = at.vector('a')
+ a = pt.vector('a')
# Access the 10th element. This will fail when a function build
# from this expression is executed with an array that is too short.
b = a[10]
@@ -118,10 +118,10 @@ Indexing also works similarly to NumPy::
# Extract a subvector
b = a[[1, 2, 10]]
-Changing elements of an array is possible using `at.set_subtensor`::
+Changing elements of an array is possible using `pt.set_subtensor`::
- a = at.vector('a')
- b = at.set_subtensor(a[:10], 1)
+ a = pt.vector('a')
+ b = pt.set_subtensor(a[:10], 1)
# is roughly equivalent to this (although pytensor avoids
# the copy if `a` isn't used anymore)
@@ -167,7 +167,7 @@ this is happening::
# in exactly this way!
model = pm.Model()
- mu = at.scalar('mu')
+ mu = pt.scalar('mu')
model.add_free_variable(mu)
model.add_logp_term(pm.Normal.dist(0, 1).logp(mu))
@@ -195,15 +195,15 @@ is roughly equivalent to this::
# For illustration only, not real code!
model = pm.Model()
- mu = at.scalar('mu')
+ mu = pt.scalar('mu')
model.add_free_variable(mu)
model.add_logp_term(pm.Normal.dist(0, 1).logp(mu))
- sd_log__ = at.scalar('sd_log__')
+ sd_log__ = pt.scalar('sd_log__')
model.add_free_variable(sd_log__)
model.add_logp_term(corrected_logp_half_normal(sd_log__))
- sigma = at.exp(sd_log__)
+ sigma = pt.exp(sd_log__)
model.add_deterministic_variable(sigma)
model.add_logp_term(pm.Normal.dist(mu, sigma).logp(data))
@@ -214,8 +214,8 @@ PyTensor operation on them::
design_matrix = np.array([[...]])
with pm.Model() as model:
- # beta is a at.dvector
+ # beta is a pt.dvector
beta = pm.Normal('beta', 0, 1, shape=len(design_matrix))
- predict = at.dot(design_matrix, beta)
+ predict = pt.dot(design_matrix, beta)
sigma = pm.HalfCauchy('sigma', beta=2.5)
pm.Normal('y', mu=predict, sigma=sigma, observed=data)
diff --git a/docs/source/api/distributions/continuous.rst b/docs/source/api/distributions/continuous.rst
index 8a5ce87621..7ebcb0f206 100644
--- a/docs/source/api/distributions/continuous.rst
+++ b/docs/source/api/distributions/continuous.rst
@@ -6,36 +6,36 @@ Continuous
.. autosummary::
:toctree: generated/
- Uniform
- Flat
- HalfFlat
- Normal
- TruncatedNormal
+ AsymmetricLaplace
Beta
- Kumaraswamy
- Exponential
- Laplace
- StudentT
Cauchy
- HalfCauchy
- Gamma
- Weibull
- HalfStudentT
- LogNormal
ChiSquared
- HalfNormal
- Wald
- Pareto
- InverseGamma
ExGaussian
- VonMises
- SkewNormal
- Triangular
+ Exponential
+ Flat
+ Gamma
Gumbel
+ HalfCauchy
+ HalfFlat
+ HalfNormal
+ HalfStudentT
+ Interpolated
+ InverseGamma
+ Kumaraswamy
+ Laplace
Logistic
LogitNormal
- Interpolated
- Rice
+ LogNormal
Moyal
- AsymmetricLaplace
+ Normal
+ Pareto
PolyaGamma
+ Rice
+ SkewNormal
+ StudentT
+ Triangular
+ TruncatedNormal
+ Uniform
+ VonMises
+ Wald
+ Weibull
diff --git a/docs/source/api/distributions/discrete.rst b/docs/source/api/distributions/discrete.rst
index 4ced5053a1..2d8aca8636 100644
--- a/docs/source/api/distributions/discrete.rst
+++ b/docs/source/api/distributions/discrete.rst
@@ -6,19 +6,19 @@ Discrete
.. autosummary::
:toctree: generated
- Binomial
- BetaBinomial
Bernoulli
- DiscreteWeibull
- Poisson
- NegativeBinomial
+ BetaBinomial
+ Binomial
+ Categorical
DiracDelta
- ZeroInflatedPoisson
- ZeroInflatedBinomial
- ZeroInflatedNegativeBinomial
DiscreteUniform
+ DiscreteWeibull
Geometric
HyperGeometric
- Categorical
+ NegativeBinomial
OrderedLogistic
OrderedProbit
+ Poisson
+ ZeroInflatedBinomial
+ ZeroInflatedNegativeBinomial
+ ZeroInflatedPoisson
diff --git a/docs/source/api/distributions/multivariate.rst b/docs/source/api/distributions/multivariate.rst
index ac401b9944..02d909cdc9 100644
--- a/docs/source/api/distributions/multivariate.rst
+++ b/docs/source/api/distributions/multivariate.rst
@@ -6,18 +6,18 @@ Multivariate
.. autosummary::
:toctree: generated
- MvNormal
- MvStudentT
- ZeroSumNormal
+ CAR
Dirichlet
- Multinomial
DirichletMultinomial
- OrderedMultinomial
- Wishart
- WishartBartlett
- LKJCorr
+ KroneckerNormal
LKJCholeskyCov
+ LKJCorr
MatrixNormal
- KroneckerNormal
- CAR
+ Multinomial
+ MvNormal
+ MvStudentT
+ OrderedMultinomial
StickBreakingWeights
+ Wishart
+ WishartBartlett
+ ZeroSumNormal
diff --git a/docs/source/api/distributions/timeseries.rst b/docs/source/api/distributions/timeseries.rst
index 8016b170f0..d0cbf6617c 100644
--- a/docs/source/api/distributions/timeseries.rst
+++ b/docs/source/api/distributions/timeseries.rst
@@ -7,8 +7,8 @@ Timeseries
:toctree: generated
AR
- GaussianRandomWalk
- GARCH11
EulerMaruyama
+ GARCH11
+ GaussianRandomWalk
MvGaussianRandomWalk
MvStudentTRandomWalk
diff --git a/docs/source/api/distributions/transforms.rst b/docs/source/api/distributions/transforms.rst
index 141e8c9dd7..1a11a71b62 100644
--- a/docs/source/api/distributions/transforms.rst
+++ b/docs/source/api/distributions/transforms.rst
@@ -13,12 +13,12 @@ Transform instances are the entities that should be used in the
.. autosummary::
:toctree: generated
- simplex
- logodds
- log_exp_m1
+ circular
log
+ log_exp_m1
+ logodds
+ simplex
sum_to_1
- circular
Specific Transform Classes
diff --git a/docs/source/api/distributions/utilities.rst b/docs/source/api/distributions/utilities.rst
index b5d3da0727..9d0a0a56ed 100644
--- a/docs/source/api/distributions/utilities.rst
+++ b/docs/source/api/distributions/utilities.rst
@@ -6,8 +6,8 @@ Distribution utilities
.. autosummary::
:toctree: generated/
- Distribution
- Discrete
Continuous
CustomDist
+ Discrete
+ Distribution
SymbolicRandomVariable
diff --git a/docs/source/api/gp/implementations.rst b/docs/source/api/gp/implementations.rst
index 3b9f972845..03b59bbf9c 100644
--- a/docs/source/api/gp/implementations.rst
+++ b/docs/source/api/gp/implementations.rst
@@ -6,6 +6,7 @@ Implementations
.. autosummary::
:toctree: generated
+ HSGP
Latent
LatentKron
Marginal
diff --git a/docs/source/api/shape_utils.rst b/docs/source/api/shape_utils.rst
index 7c6916b060..2908cb9c79 100644
--- a/docs/source/api/shape_utils.rst
+++ b/docs/source/api/shape_utils.rst
@@ -14,10 +14,6 @@ This module introduces functions that are made aware of the requested `size_tupl
:toctree: generated/
to_tuple
- shapes_broadcasting
broadcast_dist_samples_shape
- get_broadcastable_dist_samples
- broadcast_distribution_samples
- broadcast_dist_samples_to
rv_size_is_none
change_dist_size
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 71091a8d24..210ed8764d 100755
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -76,6 +76,7 @@
"SMC_kernel": ":ref:`SMC Kernel `",
"PyTensor_Op": ":class:`PyTensor Op `",
"tensor_like": ":term:`tensor_like`",
+ "unnamed_distribution": ":term:`unnamed_distribution`",
"numpy_Generator": ":class:`~numpy.random.Generator`",
"Distribution": ":ref:`Distribution `",
}
diff --git a/docs/source/contributing/gitpod/gitpod_integration.png b/docs/source/contributing/gitpod/gitpod_integration.png
new file mode 100644
index 0000000000..318297541a
Binary files /dev/null and b/docs/source/contributing/gitpod/gitpod_integration.png differ
diff --git a/docs/source/contributing/gitpod/gitpod_workspace.png b/docs/source/contributing/gitpod/gitpod_workspace.png
new file mode 100644
index 0000000000..c3cee505f6
Binary files /dev/null and b/docs/source/contributing/gitpod/gitpod_workspace.png differ
diff --git a/docs/source/contributing/implementing_distribution.md b/docs/source/contributing/implementing_distribution.md
index 78eec6b55f..85347622cd 100644
--- a/docs/source/contributing/implementing_distribution.md
+++ b/docs/source/contributing/implementing_distribution.md
@@ -129,7 +129,7 @@ Here is how the example continues:
```python
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pymc.pytensorf import floatX, intX
from pymc.distributions.continuous import PositiveContinuous
from pymc.distributions.dist_math import check_parameters
@@ -146,12 +146,12 @@ class Blah(PositiveContinuous):
# We pass the standard parametrizations to super().dist
@classmethod
def dist(cls, param1, param2=None, alt_param2=None, **kwargs):
- param1 = at.as_tensor_variable(intX(param1))
+ param1 = pt.as_tensor_variable(intX(param1))
if param2 is not None and alt_param2 is not None:
raise ValueError("Only one of param2 and alt_param2 is allowed.")
if alt_param2 is not None:
param2 = 1 / alt_param2
- param2 = at.as_tensor_variable(floatX(param2))
+ param2 = pt.as_tensor_variable(floatX(param2))
# The first value-only argument should be a list of the parameters that
# the rv_op needs in order to be instantiated
@@ -161,19 +161,19 @@ class Blah(PositiveContinuous):
# the variable, given the implicit `rv`, `size` and `param1` ... `paramN`.
# This is typically a "representative" point such as the the mean or mode.
def moment(rv, size, param1, param2):
- moment, _ = at.broadcast_arrays(param1, param2)
+ moment, _ = pt.broadcast_arrays(param1, param2)
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
# Logp returns a symbolic expression for the elementwise log-pdf or log-pmf evaluation
# of the variable given the `value` of the variable and the parameters `param1` ... `paramN`.
def logp(value, param1, param2):
- logp_expression = value * (param1 + at.log(param2))
+ logp_expression = value * (param1 + pt.log(param2))
# A switch is often used to enforce the distribution support domain
- bounded_logp_expression = at.switch(
- at.gt(value >= 0),
+ bounded_logp_expression = pt.switch(
+ pt.gt(value >= 0),
logp_expression,
-np.inf,
)
@@ -240,10 +240,11 @@ Most tests can be accommodated by the default `BaseTestDistributionRandom` class
1. Shape variable inference is correct, via `check_rv_size`
```python
-from tests.distributions.util import BaseTestDistributionRandom, seeded_scipy_distribution_builder
-class TestBlah(BaseTestDistributionRandom):
+from pymc.testing import BaseTestDistributionRandom, seeded_scipy_distribution_builder
+
+class TestBlah(BaseTestDistributionRandom):
pymc_dist = pm.Blah
# Parameters with which to test the blah pymc Distribution
pymc_dist_params = {"param1": 0.25, "param2": 2.0}
@@ -311,38 +312,36 @@ Tests for the `logp` and `logcdf` mostly make use of the helpers `check_logp`, `
`check_selfconsistency_discrete_logcdf` implemented in `~tests.distributions.util`
```python
-from tests.helpers import select_by_precision
-from tests.distributions.util import check_logp, check_logcdf, Domain
+
+from pymc.testing import Domain, check_logp, check_logcdf, select_by_precision
R = Domain([-np.inf, -2.1, -1, -0.01, 0.0, 0.01, 1, 2.1, np.inf])
Rplus = Domain([0, 0.01, 0.1, 0.9, 0.99, 1, 1.5, 2, 100, np.inf])
-
def test_blah():
-
- check_logp(
- pymc_dist=pm.Blah,
- # Domain of the distribution values
- domain=R,
- # Domains of the distribution parameters
- paramdomains={"mu": R, "sigma": Rplus},
- # Reference scipy (or other) logp function
- scipy_logp = lambda value, mu, sigma: sp.norm.logpdf(value, mu, sigma),
- # Number of decimal points expected to match between the pymc and reference functions
- decimal=select_by_precision(float64=6, float32=3),
- # Maximum number of combinations of domain * paramdomains to test
- n_samples=100,
- )
-
- check_logcdf(
- pymc_dist=pm.Blah,
- domain=R,
- paramdomains={"mu": R, "sigma": Rplus},
- scipy_logcdf=lambda value, mu, sigma: sp.norm.logcdf(value, mu, sigma),
- decimal=select_by_precision(float64=6, float32=1),
- n_samples=-1,
- )
+ check_logp(
+ pymc_dist=pm.Blah,
+ # Domain of the distribution values
+ domain=R,
+ # Domains of the distribution parameters
+ paramdomains={"mu": R, "sigma": Rplus},
+ # Reference scipy (or other) logp function
+ scipy_logp=lambda value, mu, sigma: sp.norm.logpdf(value, mu, sigma),
+ # Number of decimal points expected to match between the pymc and reference functions
+ decimal=select_by_precision(float64=6, float32=3),
+ # Maximum number of combinations of domain * paramdomains to test
+ n_samples=100,
+ )
+
+ check_logcdf(
+ pymc_dist=pm.Blah,
+ domain=R,
+ paramdomains={"mu": R, "sigma": Rplus},
+ scipy_logcdf=lambda value, mu, sigma: sp.norm.logcdf(value, mu, sigma),
+ decimal=select_by_precision(float64=6, float32=1),
+ n_samples=-1,
+ )
```
@@ -382,7 +381,8 @@ which checks if:
import pytest
from pymc.distributions import Blah
-from tests.distributions.util import assert_moment_is_expected
+from pymc.testing import assert_moment_is_expected
+
@pytest.mark.parametrize(
"param1, param2, size, expected",
diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md
index 975135a517..09162a8751 100644
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@@ -34,6 +34,7 @@ If you have other ideas let us know on [Discourse](https://discourse.pymc.io/) t
companies, to people who could use PyMC in their work or making sure that academics who use PyMC
cite it correctly in their work
* Help with our fundraising efforts
+* Add timestamps to [videos from PyMCon](https://github.com/pymc-devs/video-timestamps)
### Contribute via Pull Requests on GitHub
We have a {ref}`pr_tutorial` and a {ref}`pr_checklist` page to help in all the steps of the contributing
@@ -94,6 +95,7 @@ implementing_distribution
build_docs
running_the_test_suite
review_pr_pymc_examples
+using_gitpod
:::
:::{toctree}
diff --git a/docs/source/contributing/using_gitpod.md b/docs/source/contributing/using_gitpod.md
new file mode 100644
index 0000000000..6cdafedb08
--- /dev/null
+++ b/docs/source/contributing/using_gitpod.md
@@ -0,0 +1,132 @@
+(using_gitpod)=
+# Using Gitpod
+
+## About Gitpod
+[Gitpod](https://www.gitpod.io/) is a browser-based development environment.
+
+These are some benefits of using Gitpod:
+
+- Bypass local computer configuration and technical issues
+- Save time by using a pre-configured virtual environment for contributing to open source
+- Save space on your local computer
+
+## Using Gitpod to Contribute to PyMC
+
+These instructions are for contributing specifically to the [pymc-devs/pymc](https://github.com/pymc-devs/pymc) repo.
+
+### Gitpod Workflow
+
+1. Fork the pymc repo: [https://github.com/pymc-devs/pymc](https://github.com/pymc-devs/pymc)
+
+2. Create a Gitpod account. You can login and authorize access via your GitHub account: [https://gitpod.io/](https://gitpod.io/)
+
+ :::{note}
+ Gitpod will show up as an authorized application in your GitHub account here: [https://github.com/settings/applications](https://github.com/settings/applications)
+ :::
+
+3. Grant GitHub / Gitpod integration permissions.
+
+ a) Go to: [https://gitpod.io/user/integrations](https://gitpod.io/user/integrations)
+
+ b) Select GitHub and then "Edit Permissions"
+
+   c) Select these permissions: `user:email`, `public_repo`, `repo`, `workflow`
+
+ :::{figure-md} Gitpod integration
+
+ 
+
+ Gitpod [integration options](https://gitpod.io/user/integrations)
+ :::
+
+4. Within Gitpod, create a ["New Workspace"](https://gitpod.io/workspaces). Here you will want to select the forked pymc repo. If you do not see it, you can paste your forked repo path into the "Context URL" box. For example: `https://github.com/yourusername/pymc`. Then select "New Workspace".
+
+ :::{figure-md} Gitpod workspace
+
+ 
+
+ Gitpod [workspace](https://gitpod.io/workspaces)
+ :::
+
+ :::{note}
+ Gitpod will pull a container and set up the workspace. It will take a few minutes for the container to build.
+ :::
+
+5. Once Gitpod is up and running, an interface similar to Visual Studio Code (VS Code) will appear in your browser. You will see installation notices in the terminal window; this can take 5-10 minutes. Once that is complete, the terminal will indicate you are in the "(base)" environment on Gitpod with your forked repo.
+
+ Here is an example:
+
+ ```console
+ (base) gitpod@reshamas-pymc-0ygu5rf74md:/workspace/pymc$
+ ```
+
+ :::{note}
+   This working environment has been set up with [micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html), which is a small, pure-C++ executable with enough functionality to bootstrap fully functional conda environments.
+ :::
+
+6. Check that your git remotes are correct with `git remote -v` at the terminal.
+
+ Example:
+
+ ```console
+ (base) gitpod@reshamas-pymc-0ygu5rf74md:/workspace/pymc$ git remote -v
+ origin https://github.com/reshamas/pymc.git (fetch)
+ origin https://github.com/reshamas/pymc.git (push)
+ upstream https://github.com/pymc-devs/pymc.git (fetch)
+ upstream https://github.com/pymc-devs/pymc.git (push)
+ (base) gitpod@reshamas-pymc-0ygu5rf74md:/workspace/pymc$
+ ```
+
+7. Check which versions of python and pymc are being used at the terminal.
+
+ Check the version of pymc: `pip list | grep pymc`
+
+ Example:
+
+ ```console
+ (base) gitpod@reshamas-pymc-vpfb4pvr90z:/workspace/pymc$ pip list | grep pymc
+ pymc 5.1.0 /workspace/pymc
+ pymc-sphinx-theme 0.1
+ ```
+
+ Check the version of python: `python3 --version`
+
+ Example:
+
+ ```console
+ (base) gitpod@reshamas-pymc-vpfb4pvr90z:/workspace/pymc$ python3 --version
+ Python 3.11.0
+ ```
+
+### Reminder: Git Workflow
+
+:::{attention}
+At the terminal, before beginning work, remember to create a feature branch:
+
+```console
+git checkout -b feature-branch
+```
+
+After working on a file, follow the Git workflow:
+
+- `git add file_name`
+- `git commit -m 'message'`
+- `git push origin feature-branch`
+:::
+
+### Resources
+- Video: [Using Gitpod to Contribute to PyMC](https://youtu.be/jsjOmhUaKuU) (15 minute video)
+
+### Gitpod Notes
+
+#### Billing
+The Gitpod free plan currently allows 500 free credits per month, which is 50 hours of standard workspace usage. Usage information can be found in the [Gitpod billing section](https://gitpod.io/user/billing).
+
+:::{caution}
+Be sure to check out the Gitpod policies on [Workspace Deletion](https://www.gitpod.io/docs/configure/workspaces/workspace-lifecycle#workspace-deletion) and learn more about:
+
+- "Starting" & "Stopping" workspaces
+- "Workplace Inactivity": By default, workspaces stop following 30 minutes without user input (e.g. keystrokes or terminal input commands). You can increase the workspace timeout up to a maximum of 24 hours.
+- Workspaces are deleted after 14 days. Pinned workspaces are never deleted automatically.
+- You can pin a workspace from your workspace list in the Gitpod dashboard.
+:::
diff --git a/docs/source/glossary.md b/docs/source/glossary.md
index c0dd156c8e..4e58cb7144 100644
--- a/docs/source/glossary.md
+++ b/docs/source/glossary.md
@@ -128,9 +128,30 @@ tensor_like
Any scalar or sequence that can be interpreted as a {class}`~pytensor.tensor.TensorVariable`. In addition to TensorVariables, this includes NumPy ndarrays, scalars, lists and tuples (possibly nested). Any argument accepted by `pytensor.tensor.as_tensor_variable` is tensor_like.
```{jupyter-execute}
- import pytensor.tensor as at
+ import pytensor.tensor as pt
- at.as_tensor_variable([[1, 2.0], [0, 0]])
+ pt.as_tensor_variable([[1, 2.0], [0, 0]])
```
+unnamed_distribution
+  PyMC distributions can be initialized directly (e.g. `pm.Normal`) or using the `.dist` classmethod (e.g. `pm.Normal.dist`). Distributions initialized with the first method are registered as model parameters and therefore need to be given a name and be initialized within a model context. "unnamed_distributions" are distributions initialized with the second method. They are standalone distributions: they are not parameters of any model and can be used to draw samples from a distribution on its own or as components of other distributions such as mixtures or censored distributions.
+
+ "unnamed_distributions" can be used outside the model context. For example:
+
+ ```{jupyter-execute}
+ import pymc as pm
+
+ unnamed_dist = pm.Normal.dist(mu=1, sigma=2)
+ pm.draw(unnamed_dist, draws=10)
+ ```
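+
+  Unnamed distributions can also serve as components of other distributions. Here is a minimal sketch (assuming the `pm.Censored.dist(dist, lower, upper)` signature) that censors an unnamed normal and draws from it:
+
+  ```{jupyter-execute}
+  import pymc as pm
+
+  censored_dist = pm.Censored.dist(pm.Normal.dist(mu=1, sigma=2), lower=-1, upper=1)
+  pm.draw(censored_dist, draws=10)
+  ```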
+
+ Trying to initialize a named distribution outside a model context raises a `TypeError`:
+
+ ```{jupyter-execute}
+ :raises: TypeError
+
+ import pymc as pm
+
+ pm.Normal("variable")
+ ```
:::::
diff --git a/pymc/data.py b/pymc/data.py
index 4cd513bb8c..71ca2439fa 100644
--- a/pymc/data.py
+++ b/pymc/data.py
@@ -13,8 +13,6 @@
# limitations under the License.
import io
-import os
-import pkgutil
import urllib.request
import warnings
@@ -24,7 +22,7 @@
import numpy as np
import pandas as pd
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import xarray as xr
from pytensor.compile.sharedvalue import SharedVariable
@@ -63,12 +61,8 @@ def get_data(filename):
-------
BytesIO of the data
"""
- data_pkg = "tests"
- try:
- content = pkgutil.get_data(data_pkg, os.path.join("data", filename))
- except FileNotFoundError:
- with urllib.request.urlopen(BASE_URL.format(filename=filename)) as handle:
- content = handle.read()
+ with urllib.request.urlopen(BASE_URL.format(filename=filename)) as handle:
+ content = handle.read()
return io.BytesIO(content)
@@ -170,7 +164,7 @@ def assert_all_scalars_equal(scalar, *scalars):
else:
return Assert(
"All variables shape[0] in Minibatch should be equal, check your Minibatch(data1, data2, ...) code"
- )(scalar, at.all([scalar == s for s in scalars]))
+ )(scalar, pt.all([scalar == s for s in scalars]))
def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size: int):
@@ -191,7 +185,7 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:
>>> mdata1, mdata2 = Minibatch(data1, data2, batch_size=10)
"""
- tensor, *tensors = tuple(map(at.as_tensor, (variable, *variables)))
+ tensor, *tensors = tuple(map(pt.as_tensor, (variable, *variables)))
upper = assert_all_scalars_equal(*[t.shape[0] for t in (tensor, *tensors)])
slc = minibatch_index(0, upper, size=batch_size)
for i, v in enumerate((tensor, *tensors)):
@@ -441,7 +435,7 @@ def Data(
if mutable:
x = pytensor.shared(arr, name, **kwargs)
else:
- x = at.as_tensor_variable(arr, name, **kwargs)
+ x = pt.as_tensor_variable(arr, name, **kwargs)
if isinstance(dims, str):
dims = (dims,)
diff --git a/pymc/distributions/bound.py b/pymc/distributions/bound.py
index e427938ddf..a28b3647ad 100644
--- a/pymc/distributions/bound.py
+++ b/pymc/distributions/bound.py
@@ -14,7 +14,7 @@
import warnings
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.tensor import as_tensor_variable
from pytensor.tensor.random.op import RandomVariable
@@ -25,7 +25,7 @@
from pymc.distributions.distribution import Continuous, Discrete
from pymc.distributions.shape_utils import to_tuple
from pymc.distributions.transforms import _default_transform
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.logprob.utils import ignore_logprob
from pymc.model import modelcontext
from pymc.pytensorf import floatX, intX
@@ -72,8 +72,8 @@ def logp(value, distribution, lower, upper):
-------
TensorVariable
"""
- res = at.switch(
- at.or_(at.lt(value, lower), at.gt(value, upper)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, lower), pt.gt(value, upper)),
-np.inf,
logp(distribution, value),
)
@@ -126,8 +126,8 @@ def logp(value, distribution, lower, upper):
-------
TensorVariable
"""
- res = at.switch(
- at.or_(at.lt(value, lower), at.gt(value, upper)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, lower), pt.gt(value, upper)),
-np.inf,
logp(distribution, value),
)
diff --git a/pymc/distributions/censored.py b/pymc/distributions/censored.py
index 98a10ce476..e9dfd8d5fd 100644
--- a/pymc/distributions/censored.py
+++ b/pymc/distributions/censored.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.tensor import TensorVariable
from pytensor.tensor.random.op import RandomVariable
@@ -52,15 +52,15 @@ class Censored(Distribution):
Parameters
----------
- dist: unnamed distribution
- Univariate distribution created via the `.dist()` API, which will be censored.
+ dist : unnamed_distribution
+ Univariate distribution which will be censored.
This distribution must have a logcdf method implemented for sampling.
.. warning:: dist will be cloned, rendering it independent of the one passed as input.
- lower: float or None
+ lower : float or None
Lower (left) censoring point. If `None` the distribution will not be left censored
- upper: float or None
+ upper : float or None
Upper (right) censoring point. If `None`, the distribution will not be right censored.
Warnings
@@ -101,16 +101,16 @@ def dist(cls, dist, lower, upper, **kwargs):
@classmethod
def rv_op(cls, dist, lower=None, upper=None, size=None):
- lower = at.constant(-np.inf) if lower is None else at.as_tensor_variable(lower)
- upper = at.constant(np.inf) if upper is None else at.as_tensor_variable(upper)
+ lower = pt.constant(-np.inf) if lower is None else pt.as_tensor_variable(lower)
+ upper = pt.constant(np.inf) if upper is None else pt.as_tensor_variable(upper)
# When size is not specified, dist may have to be broadcasted according to lower/upper
- dist_shape = size if size is not None else at.broadcast_shape(dist, lower, upper)
+ dist_shape = size if size is not None else pt.broadcast_shape(dist, lower, upper)
dist = change_dist_size(dist, dist_shape)
# Censoring is achieved by clipping the base distribution between lower and upper
dist_, lower_, upper_ = dist.type(), lower.type(), upper.type()
- censored_rv_ = at.clip(dist_, lower_, upper_)
+ censored_rv_ = pt.clip(dist_, lower_, upper_)
return CensoredRV(
inputs=[dist_, lower_, upper_],
@@ -129,22 +129,22 @@ def change_censored_size(cls, dist, new_size, expand=False):
@_moment.register(CensoredRV)
def moment_censored(op, rv, dist, lower, upper):
- moment = at.switch(
- at.eq(lower, -np.inf),
- at.switch(
- at.isinf(upper),
+ moment = pt.switch(
+ pt.eq(lower, -np.inf),
+ pt.switch(
+ pt.isinf(upper),
# lower = -inf, upper = inf
0,
# lower = -inf, upper = x
upper - 1,
),
- at.switch(
- at.eq(upper, np.inf),
+ pt.switch(
+ pt.eq(upper, np.inf),
# lower = x, upper = inf
lower + 1,
# lower = x, upper = x
(lower + upper) / 2,
),
)
- moment = at.full_like(dist, moment)
+ moment = pt.full_like(dist, moment)
return moment
diff --git a/pymc/distributions/continuous.py b/pymc/distributions/continuous.py
index 63c93f4778..91d2193504 100644
--- a/pymc/distributions/continuous.py
+++ b/pymc/distributions/continuous.py
@@ -26,7 +26,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Apply, Variable
from pytensor.graph.op import Op
@@ -56,7 +56,7 @@
from pytensor.tensor.random.op import RandomVariable
from pytensor.tensor.var import TensorConstant
-from pymc.logprob.abstract import _logprob, logcdf, logprob
+from pymc.logprob.abstract import _logcdf_helper, _logprob_helper
try:
from polyagamma import polyagamma_cdf, polyagamma_pdf, random_polyagamma
@@ -79,6 +79,8 @@ def polyagamma_cdf(*args, **kwargs):
from pymc.distributions import transforms
from pymc.distributions.dist_math import (
SplineWrapper,
+ check_icdf_parameters,
+ check_icdf_value,
check_parameters,
clipped_beta_rvs,
i0e,
@@ -181,13 +183,13 @@ def transform_params(*args):
if isinstance(lower, TensorConstant) and np.all(lower.value == -np.inf):
lower = None
else:
- lower = at.as_tensor_variable(lower)
+ lower = pt.as_tensor_variable(lower)
if upper is not None:
if isinstance(upper, TensorConstant) and np.all(upper.value == np.inf):
upper = None
else:
- upper = at.as_tensor_variable(upper)
+ upper = pt.as_tensor_variable(upper)
return lower, upper
@@ -202,7 +204,7 @@ def assert_negative_support(var, label, distname, value=-1e-6):
)
msg = f"The variable specified for {label} has negative support for {distname}, "
msg += "likely making it unsuitable for this parameter."
- return Assert(msg)(var, at.all(at.ge(var, 0.0)))
+ return Assert(msg)(var, pt.all(pt.ge(var, 0.0)))
def get_tau_sigma(tau=None, sigma=None):
@@ -232,24 +234,26 @@ def get_tau_sigma(tau=None, sigma=None):
tau = 1.0
else:
if isinstance(sigma, Variable):
- sigma_ = check_parameters(sigma, sigma > 0, msg="sigma > 0")
+ # Keep tau negative, if sigma was negative, so that it will fail when used
+ tau = (sigma**-2.0) * pt.sign(sigma)
else:
sigma_ = np.asarray(sigma)
if np.any(sigma_ <= 0):
raise ValueError("sigma must be positive")
- tau = sigma_**-2.0
+ tau = sigma_**-2.0
else:
if sigma is not None:
raise ValueError("Can't pass both tau and sigma")
else:
if isinstance(tau, Variable):
- tau_ = check_parameters(tau, tau > 0, msg="tau > 0")
+ # Keep sigma negative, if tau was negative, so that it will fail when used
+ sigma = pt.abs(tau) ** (-0.5) * pt.sign(tau)
else:
tau_ = np.asarray(tau)
if np.any(tau_ <= 0):
raise ValueError("tau must be positive")
- sigma = tau_**-0.5
+ sigma = tau_**-0.5
return floatX(tau), floatX(sigma)
@@ -302,21 +306,21 @@ class Uniform(BoundedContinuous):
@classmethod
def dist(cls, lower=0, upper=1, **kwargs):
- lower = at.as_tensor_variable(floatX(lower))
- upper = at.as_tensor_variable(floatX(upper))
+ lower = pt.as_tensor_variable(floatX(lower))
+ upper = pt.as_tensor_variable(floatX(upper))
return super().dist([lower, upper], **kwargs)
def moment(rv, size, lower, upper):
- lower, upper = at.broadcast_arrays(lower, upper)
+ lower, upper = pt.broadcast_arrays(lower, upper)
moment = (lower + upper) / 2
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
def logp(value, lower, upper):
- res = at.switch(
- at.bitwise_and(at.ge(value, lower), at.le(value, upper)),
- at.fill(value, -at.log(upper - lower)),
+ res = pt.switch(
+ pt.bitwise_and(pt.ge(value, lower), pt.le(value, upper)),
+ pt.fill(value, -pt.log(upper - lower)),
-np.inf,
)
@@ -327,12 +331,12 @@ def logp(value, lower, upper):
)
def logcdf(value, lower, upper):
- res = at.switch(
- at.lt(value, lower),
+ res = pt.switch(
+ pt.lt(value, lower),
-np.inf,
- at.switch(
- at.lt(value, upper),
- at.log(value - lower) - at.log(upper - lower),
+ pt.switch(
+ pt.lt(value, upper),
+ pt.log(value - lower) - pt.log(upper - lower),
0,
),
)
@@ -343,6 +347,11 @@ def logcdf(value, lower, upper):
msg="lower <= upper",
)
+ def icdf(value, lower, upper):
+ res = lower + (upper - lower) * value
+ res = check_icdf_value(res, value)
+ return check_icdf_parameters(res, lower < upper)
+
@_default_transform.register(Uniform)
def uniform_default_transform(op, rv):
@@ -382,14 +391,14 @@ def dist(cls, **kwargs):
return res
def moment(rv, size):
- return at.zeros(size)
+ return pt.zeros(size)
def logp(value):
- return at.zeros_like(value)
+ return pt.zeros_like(value)
def logcdf(value):
- return at.switch(
- at.eq(value, -np.inf), -np.inf, at.switch(at.eq(value, np.inf), 0, at.log(0.5))
+ return pt.switch(
+ pt.eq(value, -np.inf), -np.inf, pt.switch(pt.eq(value, np.inf), 0, pt.log(0.5))
)
@@ -423,13 +432,13 @@ def dist(cls, **kwargs):
return res
def moment(rv, size):
- return at.ones(size)
+ return pt.ones(size)
def logp(value):
- return at.switch(at.lt(value, 0), -np.inf, at.zeros_like(value))
+ return pt.switch(pt.lt(value, 0), -np.inf, pt.zeros_like(value))
def logcdf(value):
- return at.switch(at.lt(value, np.inf), -np.inf, at.switch(at.eq(value, np.inf), 0, -np.inf))
+ return pt.switch(pt.lt(value, np.inf), -np.inf, pt.switch(pt.eq(value, np.inf), 0, -np.inf))
class Normal(Continuous):
@@ -502,22 +511,22 @@ class Normal(Continuous):
@classmethod
def dist(cls, mu=0, sigma=None, tau=None, **kwargs):
tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
- sigma = at.as_tensor_variable(sigma)
+ sigma = pt.as_tensor_variable(sigma)
- # tau = at.as_tensor_variable(tau)
- # mean = median = mode = mu = at.as_tensor_variable(floatX(mu))
+ # tau = pt.as_tensor_variable(tau)
+ # mean = median = mode = mu = pt.as_tensor_variable(floatX(mu))
# variance = 1.0 / self.tau
return super().dist([mu, sigma], **kwargs)
def moment(rv, size, mu, sigma):
- mu, _ = at.broadcast_arrays(mu, sigma)
+ mu, _ = pt.broadcast_arrays(mu, sigma)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, mu, sigma):
- res = -0.5 * at.pow((value - mu) / sigma, 2) - at.log(at.sqrt(2.0 * np.pi)) - at.log(sigma)
+ res = -0.5 * pt.pow((value - mu) / sigma, 2) - pt.log(pt.sqrt(2.0 * np.pi)) - pt.log(sigma)
return check_parameters(
res,
sigma > 0,
@@ -532,7 +541,13 @@ def logcdf(value, mu, sigma):
)
def icdf(value, mu, sigma):
- return mu + sigma * -np.sqrt(2.0) * at.erfcinv(2 * value)
+ res = mu + sigma * -np.sqrt(2.0) * pt.erfcinv(2 * value)
+ res = check_icdf_value(res, value)
+ return check_icdf_parameters(
+ res,
+ sigma > 0,
+ msg="sigma > 0",
+ )
class TruncatedNormalRV(RandomVariable):
@@ -657,27 +672,27 @@ def dist(
**kwargs,
) -> RandomVariable:
tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
- sigma = at.as_tensor_variable(sigma)
- tau = at.as_tensor_variable(tau)
- mu = at.as_tensor_variable(floatX(mu))
+ sigma = pt.as_tensor_variable(sigma)
+ tau = pt.as_tensor_variable(tau)
+ mu = pt.as_tensor_variable(floatX(mu))
- lower = at.as_tensor_variable(floatX(lower)) if lower is not None else at.constant(-np.inf)
- upper = at.as_tensor_variable(floatX(upper)) if upper is not None else at.constant(np.inf)
+ lower = pt.as_tensor_variable(floatX(lower)) if lower is not None else pt.constant(-np.inf)
+ upper = pt.as_tensor_variable(floatX(upper)) if upper is not None else pt.constant(np.inf)
return super().dist([mu, sigma, lower, upper], **kwargs)
def moment(rv, size, mu, sigma, lower, upper):
- mu, _, lower, upper = at.broadcast_arrays(mu, sigma, lower, upper)
- moment = at.switch(
- at.eq(lower, -np.inf),
- at.switch(
- at.eq(upper, np.inf),
+ mu, _, lower, upper = pt.broadcast_arrays(mu, sigma, lower, upper)
+ moment = pt.switch(
+ pt.eq(lower, -np.inf),
+ pt.switch(
+ pt.eq(upper, np.inf),
# lower = -inf, upper = inf
mu,
# lower = -inf, upper = x
upper - 1,
),
- at.switch(
- at.eq(upper, np.inf),
+ pt.switch(
+ pt.eq(upper, np.inf),
# lower = x, upper = inf
lower + 1,
# lower = x, upper = x
@@ -686,7 +701,7 @@ def moment(rv, size, mu, sigma, lower, upper):
)
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
@@ -701,7 +716,7 @@ def logp(value, mu, sigma, lower, upper):
lcdf_b = normal_lcdf(mu, sigma, upper)
lsf_a = normal_lccdf(mu, sigma, lower)
lsf_b = normal_lccdf(mu, sigma, upper)
- norm = at.switch(lower > 0, logdiffexp(lsf_a, lsf_b), logdiffexp(lcdf_b, lcdf_a))
+ norm = pt.switch(lower > 0, logdiffexp(lsf_a, lsf_b), logdiffexp(lcdf_b, lcdf_a))
elif is_lower_bounded:
norm = normal_lccdf(mu, sigma, lower)
elif is_upper_bounded:
@@ -709,18 +724,18 @@ def logp(value, mu, sigma, lower, upper):
else:
norm = 0.0
- logp = _logprob(normal, (value,), None, None, None, mu, sigma) - norm
+ logp = _logprob_helper(Normal.dist(mu, sigma), value) - norm
if is_lower_bounded:
- logp = at.switch(value < lower, -np.inf, logp)
+ logp = pt.switch(value < lower, -np.inf, logp)
if is_upper_bounded:
- logp = at.switch(value > upper, -np.inf, logp)
+ logp = pt.switch(value > upper, -np.inf, logp)
if is_lower_bounded and is_upper_bounded:
logp = check_parameters(
logp,
- at.le(lower, upper),
+ pt.le(lower, upper),
msg="lower_bound <= upper_bound",
)
@@ -801,7 +816,13 @@ class HalfNormal(PositiveContinuous):
rv_op = halfnormal
@classmethod
- def dist(cls, sigma=None, tau=None, *args, **kwargs):
+ def dist(
+ cls,
+ sigma: Optional[DIST_PARAMETER_TYPES] = None,
+ tau: Optional[DIST_PARAMETER_TYPES] = None,
+ *args,
+ **kwargs,
+ ):
tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
return super().dist([0.0, sigma], **kwargs)
@@ -809,12 +830,12 @@ def dist(cls, sigma=None, tau=None, *args, **kwargs):
def moment(rv, size, loc, sigma):
moment = loc + sigma
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
def logp(value, loc, sigma):
- res = -0.5 * at.pow((value - loc) / sigma, 2) + at.log(at.sqrt(2.0 / np.pi)) - at.log(sigma)
- res = at.switch(at.ge(value, loc), res, -np.inf)
+ res = -0.5 * pt.pow((value - loc) / sigma, 2) + pt.log(pt.sqrt(2.0 / np.pi)) - pt.log(sigma)
+ res = pt.switch(pt.ge(value, loc), res, -np.inf)
return check_parameters(
res,
sigma > 0,
@@ -823,10 +844,10 @@ def logp(value, loc, sigma):
def logcdf(value, loc, sigma):
z = zvalue(value, mu=loc, sigma=sigma)
- logcdf = at.switch(
- at.lt(value, loc),
+ logcdf = pt.switch(
+ pt.lt(value, loc),
-np.inf,
- at.log1p(-at.erfc(z / at.sqrt(2.0))),
+ pt.log1p(-pt.erfc(z / pt.sqrt(2.0))),
)
return check_parameters(
@@ -933,17 +954,24 @@ class Wald(PositiveContinuous):
rv_op = wald
@classmethod
- def dist(cls, mu=None, lam=None, phi=None, alpha=0.0, **kwargs):
+ def dist(
+ cls,
+ mu: Optional[DIST_PARAMETER_TYPES] = None,
+ lam: Optional[DIST_PARAMETER_TYPES] = None,
+ phi: Optional[DIST_PARAMETER_TYPES] = None,
+ alpha: Optional[DIST_PARAMETER_TYPES] = 0.0,
+ **kwargs,
+ ):
mu, lam, phi = cls.get_mu_lam_phi(mu, lam, phi)
- alpha = at.as_tensor_variable(floatX(alpha))
- mu = at.as_tensor_variable(floatX(mu))
- lam = at.as_tensor_variable(floatX(lam))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ mu = pt.as_tensor_variable(floatX(mu))
+ lam = pt.as_tensor_variable(floatX(lam))
return super().dist([mu, lam, alpha], **kwargs)
def moment(rv, size, mu, lam, alpha):
- mu, _, _ = at.broadcast_arrays(mu, lam, alpha)
+ mu, _, _ = pt.broadcast_arrays(mu, lam, alpha)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
@staticmethod
@@ -968,8 +996,8 @@ def get_mu_lam_phi(mu, lam, phi):
def logp(value, mu, lam, alpha):
centered_value = value - alpha
- logp = at.switch(
- at.le(centered_value, 0),
+ logp = pt.switch(
+ pt.le(centered_value, 0),
-np.inf,
(
logpow(lam / (2.0 * np.pi), 0.5)
@@ -990,17 +1018,17 @@ def logcdf(value, mu, lam, alpha):
value -= alpha
q = value / mu
l = lam * mu
- r = at.sqrt(value * lam)
+ r = pt.sqrt(value * lam)
a = normal_lcdf(0, 1, (q - 1.0) / r)
b = 2.0 / l + normal_lcdf(0, 1, -(q + 1.0) / r)
- logcdf = at.switch(
- at.le(value, 0),
+ logcdf = pt.switch(
+ pt.le(value, 0),
-np.inf,
- at.switch(
- at.lt(value, np.inf),
- a + at.log1pexp(b - a),
+ pt.switch(
+ pt.lt(value, np.inf),
+ a + pt.log1pexp(b - a),
0,
),
)
@@ -1034,6 +1062,10 @@ class Beta(UnitContinuous):
f(x \mid \alpha, \beta) =
\frac{x^{\alpha - 1} (1 - x)^{\beta - 1}}{B(\alpha, \beta)}
+ where :math:`B` is the Beta function.
+
+ For more information, see https://en.wikipedia.org/wiki/Beta_distribution.
+
.. plot::
:context: close-figs
@@ -1096,17 +1128,26 @@ class Beta(UnitContinuous):
rv_op = pytensor.tensor.random.beta
@classmethod
- def dist(cls, alpha=None, beta=None, mu=None, sigma=None, nu=None, *args, **kwargs):
+ def dist(
+ cls,
+ alpha: Optional[DIST_PARAMETER_TYPES] = None,
+ beta: Optional[DIST_PARAMETER_TYPES] = None,
+ mu: Optional[DIST_PARAMETER_TYPES] = None,
+ sigma: Optional[DIST_PARAMETER_TYPES] = None,
+ nu: Optional[DIST_PARAMETER_TYPES] = None,
+ *args,
+ **kwargs,
+ ):
alpha, beta = cls.get_alpha_beta(alpha, beta, mu, sigma, nu)
- alpha = at.as_tensor_variable(floatX(alpha))
- beta = at.as_tensor_variable(floatX(beta))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([alpha, beta], **kwargs)
def moment(rv, size, alpha, beta):
mean = alpha / (alpha + beta)
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
@classmethod
@@ -1131,11 +1172,11 @@ def get_alpha_beta(self, alpha=None, beta=None, mu=None, sigma=None, nu=None):
def logp(value, alpha, beta):
res = (
- at.switch(at.eq(alpha, 1.0), 0.0, (alpha - 1.0) * at.log(value))
- + at.switch(at.eq(beta, 1.0), 0.0, (beta - 1.0) * at.log1p(-value))
- - (at.gammaln(alpha) + at.gammaln(beta) - at.gammaln(alpha + beta))
+ pt.switch(pt.eq(alpha, 1.0), 0.0, (alpha - 1.0) * pt.log(value))
+ + pt.switch(pt.eq(beta, 1.0), 0.0, (beta - 1.0) * pt.log1p(-value))
+ - (pt.gammaln(alpha) + pt.gammaln(beta) - pt.gammaln(alpha + beta))
)
- res = at.switch(at.bitwise_and(at.ge(value, 0.0), at.le(value, 1.0)), res, -np.inf)
+ res = pt.switch(pt.bitwise_and(pt.ge(value, 0.0), pt.le(value, 1.0)), res, -np.inf)
return check_parameters(
res,
alpha > 0,
@@ -1144,12 +1185,12 @@ def logp(value, alpha, beta):
)
def logcdf(value, alpha, beta):
- logcdf = at.switch(
- at.lt(value, 0),
+ logcdf = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.switch(
- at.lt(value, 1),
- at.log(at.betainc(alpha, beta, value)),
+ pt.switch(
+ pt.lt(value, 1),
+ pt.log(pt.betainc(alpha, beta, value)),
0,
),
)
@@ -1224,22 +1265,22 @@ class Kumaraswamy(UnitContinuous):
rv_op = kumaraswamy
@classmethod
- def dist(cls, a, b, *args, **kwargs):
- a = at.as_tensor_variable(floatX(a))
- b = at.as_tensor_variable(floatX(b))
+ def dist(cls, a: DIST_PARAMETER_TYPES, b: DIST_PARAMETER_TYPES, *args, **kwargs):
+ a = pt.as_tensor_variable(floatX(a))
+ b = pt.as_tensor_variable(floatX(b))
return super().dist([a, b], *args, **kwargs)
def moment(rv, size, a, b):
- mean = at.exp(at.log(b) + at.gammaln(1 + 1 / a) + at.gammaln(b) - at.gammaln(1 + 1 / a + b))
+ mean = pt.exp(pt.log(b) + pt.gammaln(1 + 1 / a) + pt.gammaln(b) - pt.gammaln(1 + 1 / a + b))
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, a, b):
- res = at.log(a) + at.log(b) + (a - 1) * at.log(value) + (b - 1) * at.log(1 - value**a)
- res = at.switch(
- at.or_(at.lt(value, 0), at.gt(value, 1)),
+ res = pt.log(a) + pt.log(b) + (a - 1) * pt.log(value) + (b - 1) * pt.log(1 - value**a)
+ res = pt.switch(
+ pt.or_(pt.lt(value, 0), pt.gt(value, 1)),
-np.inf,
res,
)
@@ -1251,12 +1292,12 @@ def logp(value, a, b):
)
def logcdf(value, a, b):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.switch(
- at.lt(value, 1),
- at.log1mexp(b * at.log1p(-(value**a))),
+ pt.switch(
+ pt.lt(value, 1),
+ pt.log1mexp(b * pt.log1p(-(value**a))),
0,
),
)
@@ -1310,20 +1351,20 @@ class Exponential(PositiveContinuous):
rv_op = exponential
@classmethod
- def dist(cls, lam, *args, **kwargs):
- lam = at.as_tensor_variable(floatX(lam))
+ def dist(cls, lam: DIST_PARAMETER_TYPES, *args, **kwargs):
+ lam = pt.as_tensor_variable(floatX(lam))
# PyTensor exponential op is parametrized in terms of mu (1/lam)
- return super().dist([at.reciprocal(lam)], **kwargs)
+ return super().dist([pt.reciprocal(lam)], **kwargs)
def moment(rv, size, mu):
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, mu):
- res = -at.log(mu) - value / mu
- res = at.switch(at.ge(value, 0.0), res, -np.inf)
+ res = -pt.log(mu) - value / mu
+ res = pt.switch(pt.ge(value, 0.0), res, -np.inf)
return check_parameters(
res,
mu >= 0,
@@ -1331,11 +1372,11 @@ def logp(value, mu):
)
def logcdf(value, mu):
- lam = at.reciprocal(mu)
- res = at.switch(
- at.lt(value, 0),
+ lam = pt.reciprocal(mu)
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log1mexp(-lam * value),
+ pt.log1mexp(-lam * value),
)
return check_parameters(
@@ -1344,6 +1385,15 @@ def logcdf(value, mu):
msg="lam >= 0",
)
+ def icdf(value, mu):
+ res = -mu * pt.log(1 - value)
+ res = check_icdf_value(res, value)
+ return check_icdf_parameters(
+ res,
+ mu >= 0,
+ msg="mu >= 0",
+ )
+
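The new ``icdf`` comes from inverting the exponential CDF :math:`F(x) = 1 - e^{-x/\mu}`. A quick cross-check against SciPy (a sketch, not part of the patch):

    import numpy as np
    from scipy import stats

    mu, q = 2.5, 0.9
    manual = -mu * np.log(1 - q)                                         # same closed form as the icdf above
    np.testing.assert_allclose(manual, stats.expon.ppf(q, scale=mu))     # ~5.756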
class Laplace(Continuous):
r"""
@@ -1392,19 +1442,19 @@ class Laplace(Continuous):
@classmethod
def dist(cls, mu, b, *args, **kwargs):
- b = at.as_tensor_variable(floatX(b))
- mu = at.as_tensor_variable(floatX(mu))
+ b = pt.as_tensor_variable(floatX(b))
+ mu = pt.as_tensor_variable(floatX(mu))
return super().dist([mu, b], *args, **kwargs)
def moment(rv, size, mu, b):
- mu, _ = at.broadcast_arrays(mu, b)
+ mu, _ = pt.broadcast_arrays(mu, b)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, mu, b):
- res = -at.log(2 * b) - at.abs(value - mu) / b
+ res = -pt.log(2 * b) - pt.abs(value - mu) / b
return check_parameters(
res,
b > 0,
@@ -1414,13 +1464,13 @@ def logp(value, mu, b):
def logcdf(value, mu, b):
y = (value - mu) / b
- res = at.switch(
- at.le(value, mu),
- at.log(0.5) + y,
- at.switch(
- at.gt(y, 1),
- at.log1p(-0.5 * at.exp(-y)),
- at.log(1 - 0.5 * at.exp(-y)),
+ res = pt.switch(
+ pt.le(value, mu),
+ pt.log(0.5) + y,
+ pt.switch(
+ pt.gt(y, 1),
+ pt.log1p(-0.5 * pt.exp(-y)),
+ pt.log(1 - 0.5 * pt.exp(-y)),
),
)
@@ -1501,9 +1551,9 @@ class AsymmetricLaplace(Continuous):
@classmethod
def dist(cls, kappa=None, mu=None, b=None, q=None, *args, **kwargs):
kappa = cls.get_kappa(kappa, q)
- b = at.as_tensor_variable(floatX(b))
- kappa = at.as_tensor_variable(floatX(kappa))
- mu = at.as_tensor_variable(floatX(mu))
+ b = pt.as_tensor_variable(floatX(b))
+ kappa = pt.as_tensor_variable(floatX(kappa))
+ mu = pt.as_tensor_variable(floatX(mu))
return super().dist([b, kappa, mu], *args, **kwargs)
@@ -1527,13 +1577,13 @@ def moment(rv, size, b, kappa, mu):
mean = mu - (kappa - 1 / kappa) / b
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, b, kappa, mu):
value = value - mu
- res = at.log(b / (kappa + (kappa**-1))) + (
- -value * b * at.sgn(value) * (kappa ** at.sgn(value))
+ res = pt.log(b / (kappa + (kappa**-1))) + (
+ -value * b * pt.sgn(value) * (kappa ** pt.sgn(value))
)
return check_parameters(
@@ -1617,25 +1667,25 @@ class LogNormal(PositiveContinuous):
def dist(cls, mu=0, sigma=None, tau=None, *args, **kwargs):
tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
- mu = at.as_tensor_variable(floatX(mu))
- sigma = at.as_tensor_variable(floatX(sigma))
+ mu = pt.as_tensor_variable(floatX(mu))
+ sigma = pt.as_tensor_variable(floatX(sigma))
return super().dist([mu, sigma], *args, **kwargs)
def moment(rv, size, mu, sigma):
- mean = at.exp(mu + 0.5 * sigma**2)
+ mean = pt.exp(mu + 0.5 * sigma**2)
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, mu, sigma):
res = (
- -0.5 * at.pow((at.log(value) - mu) / sigma, 2)
- - 0.5 * at.log(2.0 * np.pi)
- - at.log(sigma)
- - at.log(value)
+ -0.5 * pt.pow((pt.log(value) - mu) / sigma, 2)
+ - 0.5 * pt.log(2.0 * np.pi)
+ - pt.log(sigma)
+ - pt.log(value)
)
- res = at.switch(at.gt(value, 0.0), res, -np.inf)
+ res = pt.switch(pt.gt(value, 0.0), res, -np.inf)
return check_parameters(
res,
sigma > 0,
@@ -1643,10 +1693,10 @@ def logp(value, mu, sigma):
)
def logcdf(value, mu, sigma):
- res = at.switch(
- at.le(value, 0),
+ res = pt.switch(
+ pt.le(value, 0),
-np.inf,
- normal_lcdf(mu, sigma, at.log(value)),
+ normal_lcdf(mu, sigma, pt.log(value)),
)
return check_parameters(
@@ -1742,16 +1792,16 @@ class StudentT(Continuous):
@classmethod
def dist(cls, nu, mu=0, *, sigma=None, lam=None, **kwargs):
- nu = at.as_tensor_variable(floatX(nu))
+ nu = pt.as_tensor_variable(floatX(nu))
lam, sigma = get_tau_sigma(tau=lam, sigma=sigma)
- sigma = at.as_tensor_variable(sigma)
+ sigma = pt.as_tensor_variable(sigma)
return super().dist([nu, mu, sigma], **kwargs)
def moment(rv, size, nu, mu, sigma):
- mu, _, _ = at.broadcast_arrays(mu, nu, sigma)
+ mu, _, _ = pt.broadcast_arrays(mu, nu, sigma)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, nu, mu, sigma):
@@ -1759,9 +1809,9 @@ def logp(value, nu, mu, sigma):
res = (
gammaln((nu + 1.0) / 2.0)
- + 0.5 * at.log(lam / (nu * np.pi))
+ + 0.5 * pt.log(lam / (nu * np.pi))
- gammaln(nu / 2.0)
- - (nu + 1.0) / 2.0 * at.log1p(lam * (value - mu) ** 2 / nu)
+ - (nu + 1.0) / 2.0 * pt.log1p(lam * (value - mu) ** 2 / nu)
)
return check_parameters(
@@ -1775,10 +1825,10 @@ def logcdf(value, nu, mu, sigma):
_, sigma = get_tau_sigma(sigma=sigma)
t = (value - mu) / sigma
- sqrt_t2_nu = at.sqrt(t**2 + nu)
+ sqrt_t2_nu = pt.sqrt(t**2 + nu)
x = (t + sqrt_t2_nu) / (2.0 * sqrt_t2_nu)
- res = at.log(at.betainc(nu / 2.0, nu / 2.0, x))
+ res = pt.log(pt.betainc(nu / 2.0, nu / 2.0, x))
return check_parameters(
res,
@@ -1839,20 +1889,20 @@ class Pareto(BoundedContinuous):
@classmethod
def dist(cls, alpha, m, **kwargs):
- alpha = at.as_tensor_variable(floatX(alpha))
- m = at.as_tensor_variable(floatX(m))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ m = pt.as_tensor_variable(floatX(m))
return super().dist([alpha, m], **kwargs)
def moment(rv, size, alpha, m):
median = m * 2 ** (1 / alpha)
if not rv_size_is_none(size):
- median = at.full(size, median)
+ median = pt.full(size, median)
return median
def logp(value, alpha, m):
- res = at.log(alpha) + logpow(m, alpha) - logpow(value, alpha + 1.0)
- res = at.switch(at.ge(value, m), res, -np.inf)
+ res = pt.log(alpha) + logpow(m, alpha) - logpow(value, alpha + 1.0)
+ res = pt.switch(pt.ge(value, m), res, -np.inf)
return check_parameters(
res,
alpha > 0,
@@ -1863,13 +1913,13 @@ def logp(value, alpha, m):
def logcdf(value, alpha, m):
arg = (m / value) ** alpha
- res = at.switch(
- at.lt(value, m),
+ res = pt.switch(
+ pt.lt(value, m),
-np.inf,
- at.switch(
- at.le(arg, 1e-5),
- at.log1p(-arg),
- at.log(1 - arg),
+ pt.switch(
+ pt.le(arg, 1e-5),
+ pt.log1p(-arg),
+ pt.log(1 - arg),
),
)
@@ -1936,19 +1986,19 @@ class Cauchy(Continuous):
@classmethod
def dist(cls, alpha, beta, *args, **kwargs):
- alpha = at.as_tensor_variable(floatX(alpha))
- beta = at.as_tensor_variable(floatX(beta))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([alpha, beta], **kwargs)
def moment(rv, size, alpha, beta):
- alpha, _ = at.broadcast_arrays(alpha, beta)
+ alpha, _ = pt.broadcast_arrays(alpha, beta)
if not rv_size_is_none(size):
- alpha = at.full(size, alpha)
+ alpha = pt.full(size, alpha)
return alpha
def logp(value, alpha, beta):
- res = -at.log(np.pi) - at.log(beta) - at.log1p(at.pow((value - alpha) / beta, 2))
+ res = -pt.log(np.pi) - pt.log(beta) - pt.log1p(pt.pow((value - alpha) / beta, 2))
return check_parameters(
res,
beta > 0,
@@ -1956,7 +2006,7 @@ def logp(value, alpha, beta):
)
def logcdf(value, alpha, beta):
- res = at.log(0.5 + at.arctan((value - alpha) / beta) / np.pi)
+ res = pt.log(0.5 + pt.arctan((value - alpha) / beta) / np.pi)
return check_parameters(
res,
beta > 0,
@@ -2007,17 +2057,17 @@ class HalfCauchy(PositiveContinuous):
@classmethod
def dist(cls, beta, *args, **kwargs):
- beta = at.as_tensor_variable(floatX(beta))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([0.0, beta], **kwargs)
def moment(rv, size, loc, beta):
if not rv_size_is_none(size):
- beta = at.full(size, beta)
+ beta = pt.full(size, beta)
return beta
def logp(value, loc, beta):
- res = at.log(2) + logprob(Cauchy.dist(loc, beta), value)
- res = at.switch(at.ge(value, loc), res, -np.inf)
+ res = pt.log(2) + _logprob_helper(Cauchy.dist(loc, beta), value)
+ res = pt.switch(pt.ge(value, loc), res, -np.inf)
return check_parameters(
res,
beta > 0,
@@ -2025,10 +2075,10 @@ def logp(value, loc, beta):
)
def logcdf(value, loc, beta):
- res = at.switch(
- at.lt(value, loc),
+ res = pt.switch(
+ pt.lt(value, loc),
-np.inf,
- at.log(2 * at.arctan((value - loc) / beta) / np.pi),
+ pt.log(2 * pt.arctan((value - loc) / beta) / np.pi),
)
return check_parameters(
@@ -2102,8 +2152,8 @@ class Gamma(PositiveContinuous):
@classmethod
def dist(cls, alpha=None, beta=None, mu=None, sigma=None, **kwargs):
alpha, beta = cls.get_alpha_beta(alpha, beta, mu, sigma)
- alpha = at.as_tensor_variable(floatX(alpha))
- beta = at.as_tensor_variable(floatX(beta))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ beta = pt.as_tensor_variable(floatX(beta))
# The PyTensor `GammaRV` `Op` will invert the `beta` parameter itself
return super().dist([alpha, beta], **kwargs)
@@ -2132,13 +2182,13 @@ def moment(rv, size, alpha, inv_beta):
# The PyTensor `GammaRV` `Op` inverts the `beta` parameter itself
mean = alpha * inv_beta
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, alpha, inv_beta):
- beta = at.reciprocal(inv_beta)
- res = -at.gammaln(alpha) + logpow(beta, alpha) - beta * value + logpow(value, alpha - 1)
- res = at.switch(at.ge(value, 0.0), res, -np.inf)
+ beta = pt.reciprocal(inv_beta)
+ res = -pt.gammaln(alpha) + logpow(beta, alpha) - beta * value + logpow(value, alpha - 1)
+ res = pt.switch(pt.ge(value, 0.0), res, -np.inf)
return check_parameters(
res,
alpha > 0,
@@ -2147,11 +2197,11 @@ def logp(value, alpha, inv_beta):
)
def logcdf(value, alpha, inv_beta):
- beta = at.reciprocal(inv_beta)
- res = at.switch(
- at.lt(value, 0),
+ beta = pt.reciprocal(inv_beta)
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log(at.gammainc(alpha, beta * value)),
+ pt.log(pt.gammainc(alpha, beta * value)),
)
return check_parameters(res, 0 < alpha, 0 < beta, msg="alpha > 0, beta > 0")
@@ -2211,17 +2261,17 @@ class InverseGamma(PositiveContinuous):
@classmethod
def dist(cls, alpha=None, beta=None, mu=None, sigma=None, *args, **kwargs):
alpha, beta = cls._get_alpha_beta(alpha, beta, mu, sigma)
- alpha = at.as_tensor_variable(floatX(alpha))
- beta = at.as_tensor_variable(floatX(beta))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([alpha, beta], **kwargs)
def moment(rv, size, alpha, beta):
mean = beta / (alpha - 1.0)
mode = beta / (alpha + 1.0)
- moment = at.switch(alpha > 1, mean, mode)
+ moment = pt.switch(alpha > 1, mean, mode)
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
@classmethod
@@ -2248,8 +2298,8 @@ def _get_alpha_beta(cls, alpha, beta, mu, sigma):
return alpha, beta
def logp(value, alpha, beta):
- res = -at.gammaln(alpha) + logpow(beta, alpha) - beta / value + logpow(value, -alpha - 1)
- res = at.switch(at.ge(value, 0.0), res, -np.inf)
+ res = -pt.gammaln(alpha) + logpow(beta, alpha) - beta / value + logpow(value, -alpha - 1)
+ res = pt.switch(pt.ge(value, 0.0), res, -np.inf)
return check_parameters(
res,
alpha > 0,
@@ -2258,10 +2308,10 @@ def logp(value, alpha, beta):
)
def logcdf(value, alpha, beta):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log(at.gammaincc(alpha, beta / value)),
+ pt.log(pt.gammaincc(alpha, beta / value)),
)
return check_parameters(
@@ -2315,20 +2365,20 @@ class ChiSquared(PositiveContinuous):
@classmethod
def dist(cls, nu, *args, **kwargs):
- nu = at.as_tensor_variable(floatX(nu))
+ nu = pt.as_tensor_variable(floatX(nu))
return super().dist([nu], *args, **kwargs)
def moment(rv, size, nu):
moment = nu
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
def logp(value, nu):
- return logprob(Gamma.dist(alpha=nu / 2, beta=0.5), value)
+ return _logprob_helper(Gamma.dist(alpha=nu / 2, beta=0.5), value)
def logcdf(value, nu):
- return logcdf(Gamma.dist(alpha=nu / 2, beta=0.5), value)
+ return _logcdf_helper(Gamma.dist(alpha=nu / 2, beta=0.5), value)
# TODO: Remove this once logp for multiplication is working!
@@ -2400,24 +2450,24 @@ class Weibull(PositiveContinuous):
@classmethod
def dist(cls, alpha, beta, *args, **kwargs):
- alpha = at.as_tensor_variable(floatX(alpha))
- beta = at.as_tensor_variable(floatX(beta))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([alpha, beta], *args, **kwargs)
def moment(rv, size, alpha, beta):
- mean = beta * at.gamma(1 + 1 / alpha)
+ mean = beta * pt.gamma(1 + 1 / alpha)
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logcdf(value, alpha, beta):
a = (value / beta) ** alpha
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log1mexp(-a),
+ pt.log1mexp(-a),
)
return check_parameters(
@@ -2429,12 +2479,12 @@ def logcdf(value, alpha, beta):
def logp(value, alpha, beta):
res = (
- at.log(alpha)
- - at.log(beta)
- + (alpha - 1.0) * at.log(value / beta)
- - at.pow(value / beta, alpha)
+ pt.log(alpha)
+ - pt.log(beta)
+ + (alpha - 1.0) * pt.log(value / beta)
+ - pt.pow(value / beta, alpha)
)
- res = at.switch(at.ge(value, 0.0), res, -np.inf)
+ res = pt.switch(pt.ge(value, 0.0), res, -np.inf)
return check_parameters(
res,
alpha > 0,
@@ -2452,7 +2502,7 @@ class HalfStudentTRV(RandomVariable):
@classmethod
def rng_fn(cls, rng, nu, sigma, size=None) -> np.ndarray:
- return np.asarray(np.abs(stats.t.rvs(nu, sigma, size=size, random_state=rng)))
+ return np.asarray(np.abs(stats.t.rvs(nu, scale=sigma, size=size, random_state=rng)))
halfstudentt = HalfStudentTRV()
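The keyword change above matters because ``scipy.stats.t.rvs`` takes ``loc`` as its second positional argument, so the old call shifted the distribution by ``sigma`` instead of scaling it. A small illustration (not part of the patch):

    import numpy as np
    from scipy import stats

    nu, sigma = 3.0, 2.0
    rng = np.random.default_rng(0)
    shifted = stats.t.rvs(nu, sigma, size=1000, random_state=rng)        # old call: loc=sigma, scale=1
    scaled = stats.t.rvs(nu, scale=sigma, size=1000, random_state=rng)   # fixed call: loc=0, scale=sigma
    shifted.mean(), scaled.mean()                                        # roughly 2.0 vs roughly 0.0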
@@ -2520,29 +2570,29 @@ class HalfStudentT(PositiveContinuous):
@classmethod
def dist(cls, nu, sigma=None, lam=None, *args, **kwargs):
- nu = at.as_tensor_variable(floatX(nu))
+ nu = pt.as_tensor_variable(floatX(nu))
lam, sigma = get_tau_sigma(lam, sigma)
- sigma = at.as_tensor_variable(sigma)
+ sigma = pt.as_tensor_variable(sigma)
return super().dist([nu, sigma], *args, **kwargs)
def moment(rv, size, nu, sigma):
- sigma, _ = at.broadcast_arrays(sigma, nu)
+ sigma, _ = pt.broadcast_arrays(sigma, nu)
if not rv_size_is_none(size):
- sigma = at.full(size, sigma)
+ sigma = pt.full(size, sigma)
return sigma
def logp(value, nu, sigma):
res = (
- at.log(2)
+ pt.log(2)
+ gammaln((nu + 1.0) / 2.0)
- gammaln(nu / 2.0)
- - 0.5 * at.log(nu * np.pi * sigma**2)
- - (nu + 1.0) / 2.0 * at.log1p(value**2 / (nu * sigma**2))
+ - 0.5 * pt.log(nu * np.pi * sigma**2)
+ - (nu + 1.0) / 2.0 * pt.log1p(value**2 / (nu * sigma**2))
)
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
res,
)
@@ -2640,25 +2690,25 @@ class ExGaussian(Continuous):
@classmethod
def dist(cls, mu=0.0, sigma=None, nu=None, *args, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
- sigma = at.as_tensor_variable(floatX(sigma))
- nu = at.as_tensor_variable(floatX(nu))
+ mu = pt.as_tensor_variable(floatX(mu))
+ sigma = pt.as_tensor_variable(floatX(sigma))
+ nu = pt.as_tensor_variable(floatX(nu))
return super().dist([mu, sigma, nu], *args, **kwargs)
def moment(rv, size, mu, sigma, nu):
- mu, nu, _ = at.broadcast_arrays(mu, nu, sigma)
+ mu, nu, _ = pt.broadcast_arrays(mu, nu, sigma)
moment = mu + nu
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
def logp(value, mu, sigma, nu):
# Algorithm is adapted from dexGAUS.R from gamlss
- res = at.switch(
- at.gt(nu, 0.05 * sigma),
+ res = pt.switch(
+ pt.gt(nu, 0.05 * sigma),
(
- -at.log(nu)
+ -pt.log(nu)
+ (mu - value) / nu
+ 0.5 * (sigma / nu) ** 2
+ normal_lcdf(mu + (sigma**2) / nu, sigma, value)
@@ -2674,8 +2724,8 @@ def logp(value, mu, sigma, nu):
def logcdf(value, mu, sigma, nu):
# Algorithm is adapted from pexGAUS.R from gamlss
- res = at.switch(
- at.gt(nu, 0.05 * sigma),
+ res = pt.switch(
+ pt.gt(nu, 0.05 * sigma),
logdiffexp(
normal_lcdf(mu, sigma, value),
(
@@ -2745,19 +2795,19 @@ class VonMises(CircularContinuous):
@classmethod
def dist(cls, mu=0.0, kappa=1.0, *args, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
- kappa = at.as_tensor_variable(floatX(kappa))
+ mu = pt.as_tensor_variable(floatX(mu))
+ kappa = pt.as_tensor_variable(floatX(kappa))
return super().dist([mu, kappa], *args, **kwargs)
def moment(rv, size, mu, kappa):
- mu, _ = at.broadcast_arrays(mu, kappa)
+ mu, _ = pt.broadcast_arrays(mu, kappa)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, mu, kappa):
- res = kappa * at.cos(mu - value) - at.log(2 * np.pi) - at.log(at.i0(kappa))
- res = at.switch(at.bitwise_and(at.ge(value, -np.pi), at.le(value, np.pi)), res, -np.inf)
+ res = kappa * pt.cos(mu - value) - pt.log(2 * np.pi) - pt.log(pt.i0(kappa))
+ res = pt.switch(pt.bitwise_and(pt.ge(value, -np.pi), pt.le(value, np.pi)), res, -np.inf)
return check_parameters(
res,
kappa > 0,
@@ -2848,25 +2898,25 @@ class SkewNormal(Continuous):
@classmethod
def dist(cls, alpha=1, mu=0.0, sigma=None, tau=None, *args, **kwargs):
tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
- alpha = at.as_tensor_variable(floatX(alpha))
- mu = at.as_tensor_variable(floatX(mu))
- tau = at.as_tensor_variable(tau)
- sigma = at.as_tensor_variable(sigma)
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ mu = pt.as_tensor_variable(floatX(mu))
+ tau = pt.as_tensor_variable(tau)
+ sigma = pt.as_tensor_variable(sigma)
return super().dist([mu, sigma, alpha], *args, **kwargs)
def moment(rv, size, mu, sigma, alpha):
mean = mu + sigma * (2 / np.pi) ** 0.5 * alpha / (1 + alpha**2) ** 0.5
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, mu, sigma, alpha):
tau, _ = get_tau_sigma(sigma=sigma)
res = (
- at.log(1 + at.erf(((value - mu) * at.sqrt(tau) * alpha) / at.sqrt(2)))
- + (-tau * (value - mu) ** 2 + at.log(tau / np.pi / 2.0)) / 2.0
+ pt.log(1 + pt.erf(((value - mu) * pt.sqrt(tau) * alpha) / pt.sqrt(2)))
+ + (-tau * (value - mu) ** 2 + pt.log(tau / np.pi / 2.0)) / 2.0
)
return check_parameters(
@@ -2937,25 +2987,25 @@ class Triangular(BoundedContinuous):
@classmethod
def dist(cls, lower=0, upper=1, c=0.5, *args, **kwargs):
- lower = at.as_tensor_variable(floatX(lower))
- upper = at.as_tensor_variable(floatX(upper))
- c = at.as_tensor_variable(floatX(c))
+ lower = pt.as_tensor_variable(floatX(lower))
+ upper = pt.as_tensor_variable(floatX(upper))
+ c = pt.as_tensor_variable(floatX(c))
return super().dist([lower, c, upper], *args, **kwargs)
def moment(rv, size, lower, c, upper):
mean = (lower + upper + c) / 3
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, lower, c, upper):
- res = at.switch(
- at.lt(value, c),
- at.log(2 * (value - lower) / ((upper - lower) * (c - lower))),
- at.log(2 * (upper - value) / ((upper - lower) * (upper - c))),
+ res = pt.switch(
+ pt.lt(value, c),
+ pt.log(2 * (value - lower) / ((upper - lower) * (c - lower))),
+ pt.log(2 * (upper - value) / ((upper - lower) * (upper - c))),
)
- res = at.switch(at.bitwise_and(at.le(lower, value), at.le(value, upper)), res, -np.inf)
+ res = pt.switch(pt.bitwise_and(pt.le(lower, value), pt.le(value, upper)), res, -np.inf)
return check_parameters(
res,
lower <= c,
@@ -2964,15 +3014,15 @@ def logp(value, lower, c, upper):
)
def logcdf(value, lower, c, upper):
- res = at.switch(
- at.le(value, lower),
+ res = pt.switch(
+ pt.le(value, lower),
-np.inf,
- at.switch(
- at.le(value, c),
- at.log(((value - lower) ** 2) / ((upper - lower) * (c - lower))),
- at.switch(
- at.lt(value, upper),
- at.log1p(-((upper - value) ** 2) / ((upper - lower) * (upper - c))),
+ pt.switch(
+ pt.le(value, c),
+ pt.log(((value - lower) ** 2) / ((upper - lower) * (c - lower))),
+ pt.switch(
+ pt.lt(value, upper),
+ pt.log1p(-((upper - value) ** 2) / ((upper - lower) * (upper - c))),
0,
),
),
@@ -3044,20 +3094,20 @@ class Gumbel(Continuous):
@classmethod
def dist(cls, mu, beta, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
- beta = at.as_tensor_variable(floatX(beta))
+ mu = pt.as_tensor_variable(floatX(mu))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([mu, beta], **kwargs)
def moment(rv, size, mu, beta):
mean = mu + beta * np.euler_gamma
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, mu, beta):
z = (value - mu) / beta
- res = -z - at.exp(-z) - at.log(beta)
+ res = -z - pt.exp(-z) - pt.log(beta)
return check_parameters(
res,
beta > 0,
@@ -3065,7 +3115,7 @@ def logp(value, mu, beta):
)
def logcdf(value, mu, beta):
- res = -at.exp(-(value - mu) / beta)
+ res = -pt.exp(-(value - mu) / beta)
return check_parameters(
res,
@@ -3154,8 +3204,8 @@ class Rice(PositiveContinuous):
@classmethod
def dist(cls, nu=None, sigma=None, b=None, *args, **kwargs):
nu, b, sigma = cls.get_nu_b(nu, b, sigma)
- b = at.as_tensor_variable(floatX(b))
- sigma = at.as_tensor_variable(floatX(sigma))
+ b = pt.as_tensor_variable(floatX(b))
+ sigma = pt.as_tensor_variable(floatX(sigma))
return super().dist([b, sigma], *args, **kwargs)
@@ -3176,24 +3226,24 @@ def moment(rv, size, nu, sigma):
mean = (
sigma
* np.sqrt(np.pi / 2)
- * at.exp(nu_sigma_ratio / 2)
+ * pt.exp(nu_sigma_ratio / 2)
* (
- (1 - nu_sigma_ratio) * at.i0(-nu_sigma_ratio / 2)
- - nu_sigma_ratio * at.i1(-nu_sigma_ratio / 2)
+ (1 - nu_sigma_ratio) * pt.i0(-nu_sigma_ratio / 2)
+ - nu_sigma_ratio * pt.i1(-nu_sigma_ratio / 2)
)
)
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, b, sigma):
x = value / sigma
- res = at.switch(
- at.le(value, 0),
+ res = pt.switch(
+ pt.le(value, 0),
-np.inf,
- at.log(x * at.exp((-(x - b) * (x - b)) / 2) * i0e(x * b) / sigma),
+ pt.log(x * pt.exp((-(x - b) * (x - b)) / 2) * i0e(x * b) / sigma),
)
return check_parameters(
@@ -3253,19 +3303,19 @@ class Logistic(Continuous):
@classmethod
def dist(cls, mu=0.0, s=1.0, *args, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
- s = at.as_tensor_variable(floatX(s))
+ mu = pt.as_tensor_variable(floatX(mu))
+ s = pt.as_tensor_variable(floatX(s))
return super().dist([mu, s], *args, **kwargs)
def moment(rv, size, mu, s):
- mu, _ = at.broadcast_arrays(mu, s)
+ mu, _ = pt.broadcast_arrays(mu, s)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, mu, s):
z = (value - mu) / s
- res = -z - at.log(s) - 2.0 * at.log1p(at.exp(-z))
+ res = -z - pt.log(s) - 2.0 * pt.log1p(pt.exp(-z))
return check_parameters(
res,
s > 0,
@@ -3273,7 +3323,7 @@ def logp(value, mu, s):
)
def logcdf(value, mu, s):
- res = -at.log1pexp(-(value - mu) / s)
+ res = -pt.log1pexp(-(value - mu) / s)
return check_parameters(
res,
@@ -3348,29 +3398,29 @@ class LogitNormal(UnitContinuous):
@classmethod
def dist(cls, mu=0, sigma=None, tau=None, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
+ mu = pt.as_tensor_variable(floatX(mu))
tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
- sigma = at.as_tensor_variable(sigma)
- tau = at.as_tensor_variable(tau)
+ sigma = pt.as_tensor_variable(sigma)
+ tau = pt.as_tensor_variable(tau)
return super().dist([mu, sigma], **kwargs)
def moment(rv, size, mu, sigma):
- median, _ = at.broadcast_arrays(invlogit(mu), sigma)
+ median, _ = pt.broadcast_arrays(invlogit(mu), sigma)
if not rv_size_is_none(size):
- median = at.full(size, median)
+ median = pt.full(size, median)
return median
def logp(value, mu, sigma):
tau, _ = get_tau_sigma(sigma=sigma)
- res = at.switch(
- at.or_(at.le(value, 0), at.ge(value, 1)),
+ res = pt.switch(
+ pt.or_(pt.le(value, 0), pt.ge(value, 1)),
-np.inf,
(
-0.5 * tau * (logit(value) - mu) ** 2
- + 0.5 * at.log(tau / (2.0 * np.pi))
- - at.log(value * (1 - value))
+ + 0.5 * pt.log(tau / (2.0 * np.pi))
+ - pt.log(value * (1 - value))
),
)
@@ -3479,12 +3529,12 @@ def dist(cls, x_points, pdf_points, *args, **kwargs):
cdf_points = interp.antiderivative()(x_points) / Z
pdf_points = pdf_points / Z
- x_points = at.constant(floatX(x_points))
- pdf_points = at.constant(floatX(pdf_points))
- cdf_points = at.constant(floatX(cdf_points))
+ x_points = pt.constant(floatX(x_points))
+ pdf_points = pt.constant(floatX(pdf_points))
+ cdf_points = pt.constant(floatX(cdf_points))
- # lower = at.as_tensor_variable(x_points[0])
- # upper = at.as_tensor_variable(x_points[-1])
+ # lower = pt.as_tensor_variable(x_points[0])
+ # upper = pt.as_tensor_variable(x_points[-1])
# median = _interpolated_argcdf(0.5, pdf_points, cdf_points, x_points)
return super().dist([x_points, pdf_points, cdf_points], **kwargs)
@@ -3493,11 +3543,11 @@ def moment(rv, size, x_points, pdf_points, cdf_points):
"""
Estimates the expectation integral using the trapezoid rule; cdf_points are not used.
"""
- x_fx = at.mul(x_points, pdf_points) # x_i * f(x_i) for all xi's in x_points
- moment = at.sum(at.mul(at.diff(x_points), x_fx[1:] + x_fx[:-1])) / 2
+ x_fx = pt.mul(x_points, pdf_points) # x_i * f(x_i) for all xi's in x_points
+ moment = pt.sum(pt.mul(pt.diff(x_points), x_fx[1:] + x_fx[:-1])) / 2
if not rv_size_is_none(size):
- moment = at.full(size, moment)
+ moment = pt.full(size, moment)
return moment
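The symbolic trapezoid-rule moment above mirrors this NumPy computation (a toy sketch, not part of the patch):

    import numpy as np

    x = np.linspace(0, 1, 201)
    pdf = 2 * x                     # an already-normalized toy density on [0, 1]
    np.trapz(x * pdf, x)            # ~0.6667, i.e. E[X] = 2/3 for this density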
@@ -3509,9 +3559,9 @@ def logp(value, x_points, pdf_points, cdf_points):
# interp and Z are converted to symbolic variables here
interp_op = SplineWrapper(interp)
- Z = at.constant(Z)
+ Z = pt.constant(Z)
- return at.log(interp_op(value) / Z)
+ return pt.log(interp_op(value) / Z)
@_default_transform.register(Interpolated)
@@ -3590,21 +3640,21 @@ class Moyal(Continuous):
@classmethod
def dist(cls, mu=0, sigma=1.0, *args, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
- sigma = at.as_tensor_variable(floatX(sigma))
+ mu = pt.as_tensor_variable(floatX(mu))
+ sigma = pt.as_tensor_variable(floatX(sigma))
return super().dist([mu, sigma], *args, **kwargs)
def moment(rv, size, mu, sigma):
- mean = mu + sigma * (np.euler_gamma + at.log(2))
+ mean = mu + sigma * (np.euler_gamma + pt.log(2))
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, mu, sigma):
scaled = (value - mu) / sigma
- res = -(1 / 2) * (scaled + at.exp(-scaled)) - at.log(sigma) - (1 / 2) * at.log(2 * np.pi)
+ res = -(1 / 2) * (scaled + pt.exp(-scaled)) - pt.log(sigma) - (1 / 2) * pt.log(2 * np.pi)
return check_parameters(
res,
sigma > 0,
@@ -3613,7 +3663,7 @@ def logp(value, mu, sigma):
def logcdf(value, mu, sigma):
scaled = (value - mu) / sigma
- res = at.log(at.erfc(at.exp(-scaled / 2) * (2**-0.5)))
+ res = pt.log(pt.erfc(pt.exp(-scaled / 2) * (2**-0.5)))
return check_parameters(
res,
sigma > 0,
@@ -3677,12 +3727,12 @@ def __init__(self, get_pdf=False):
self.get_pdf = get_pdf
def make_node(self, x, h, z):
- x = at.as_tensor_variable(floatX(x))
- h = at.as_tensor_variable(floatX(h))
- z = at.as_tensor_variable(floatX(z))
+ x = pt.as_tensor_variable(floatX(x))
+ h = pt.as_tensor_variable(floatX(h))
+ z = pt.as_tensor_variable(floatX(z))
bshape = broadcast_shape(x, h, z)
shape = [None] * len(bshape)
- return Apply(self, [x, h, z], [at.TensorType(pytensor.config.floatX, shape)()])
+ return Apply(self, [x, h, z], [pt.TensorType(pytensor.config.floatX, shape)()])
def perform(self, node, ins, outs):
x, h, z = ins[0], ins[1], ins[2]
@@ -3778,24 +3828,24 @@ class PolyaGamma(PositiveContinuous):
@classmethod
def dist(cls, h=1.0, z=0.0, **kwargs):
- h = at.as_tensor_variable(floatX(h))
- z = at.as_tensor_variable(floatX(z))
+ h = pt.as_tensor_variable(floatX(h))
+ z = pt.as_tensor_variable(floatX(z))
msg = f"The variable {h} specified for PolyaGamma has non-positive "
msg += "values, making it unsuitable for this parameter."
- Assert(msg)(h, at.all(at.gt(h, 0.0)))
+ Assert(msg)(h, pt.all(pt.gt(h, 0.0)))
return super().dist([h, z], **kwargs)
def moment(rv, size, h, z):
- mean = at.switch(at.eq(z, 0), h / 4, tanh(z / 2) * (h / (2 * z)))
+ mean = pt.switch(pt.eq(z, 0), h / 4, tanh(z / 2) * (h / (2 * z)))
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, h, z):
- res = at.switch(
- at.le(value, 0),
+ res = pt.switch(
+ pt.le(value, 0),
-np.inf,
_PolyaGammaLogDistFunc(get_pdf=True)(value, h, z),
)
@@ -3806,8 +3856,8 @@ def logp(value, h, z):
)
def logcdf(value, h, z):
- res = at.switch(
- at.le(value, 0),
+ res = pt.switch(
+ pt.le(value, 0),
-np.inf,
_PolyaGammaLogDistFunc(get_pdf=False)(value, h, z),
)
diff --git a/pymc/distributions/discrete.py b/pymc/distributions/discrete.py
index 08522cb982..8e52c812d1 100644
--- a/pymc/distributions/discrete.py
+++ b/pymc/distributions/discrete.py
@@ -14,7 +14,7 @@
import warnings
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.tensor import TensorConstant
from pytensor.tensor.random.basic import (
@@ -36,6 +36,8 @@
from pymc.distributions.dist_math import (
betaln,
binomln,
+ check_icdf_parameters,
+ check_icdf_value,
check_parameters,
factln,
log_diff_normal_cdf,
@@ -46,7 +48,7 @@
from pymc.distributions.distribution import Discrete
from pymc.distributions.mixture import Mixture
from pymc.distributions.shape_utils import rv_size_is_none
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.math import sigmoid
from pymc.pytensorf import floatX, intX
from pymc.vartypes import continuous_types
@@ -127,21 +129,21 @@ def dist(cls, n, p=None, logit_p=None, *args, **kwargs):
raise ValueError("Incompatible parametrization. Must specify either p or logit_p.")
if logit_p is not None:
- p = at.sigmoid(logit_p)
+ p = pt.sigmoid(logit_p)
- n = at.as_tensor_variable(intX(n))
- p = at.as_tensor_variable(floatX(p))
+ n = pt.as_tensor_variable(intX(n))
+ p = pt.as_tensor_variable(floatX(p))
return super().dist([n, p], **kwargs)
def moment(rv, size, n, p):
- mean = at.round(n * p)
+ mean = pt.round(n * p)
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, n, p):
- res = at.switch(
- at.or_(at.lt(value, 0), at.gt(value, n)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, 0), pt.gt(value, n)),
-np.inf,
binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value),
)
@@ -155,14 +157,14 @@ def logp(value, n, p):
)
def logcdf(value, n, p):
- value = at.floor(value)
+ value = pt.floor(value)
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.switch(
- at.lt(value, n),
- at.log(at.betainc(n - value, value + 1, 1 - p)),
+ pt.switch(
+ pt.lt(value, n),
+ pt.log(pt.betainc(n - value, value + 1, 1 - p)),
0,
),
)
@@ -237,20 +239,20 @@ def BetaBinom(a, b, n, x):
@classmethod
def dist(cls, alpha, beta, n, *args, **kwargs):
- alpha = at.as_tensor_variable(floatX(alpha))
- beta = at.as_tensor_variable(floatX(beta))
- n = at.as_tensor_variable(intX(n))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ beta = pt.as_tensor_variable(floatX(beta))
+ n = pt.as_tensor_variable(intX(n))
return super().dist([n, alpha, beta], **kwargs)
def moment(rv, size, n, alpha, beta):
- mean = at.round((n * alpha) / (alpha + beta))
+ mean = pt.round((n * alpha) / (alpha + beta))
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, n, alpha, beta):
- res = at.switch(
- at.or_(at.lt(value, 0), at.gt(value, n)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, 0), pt.gt(value, n)),
-np.inf,
binomln(n, value) + betaln(value + alpha, n - value + beta) - betaln(alpha, beta),
)
@@ -269,16 +271,16 @@ def logcdf(value, n, alpha, beta):
f"BetaBinomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object."
)
- safe_lower = at.switch(at.lt(value, 0), value, 0)
- res = at.switch(
- at.lt(value, 0),
+ safe_lower = pt.switch(pt.lt(value, 0), value, 0)
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.switch(
- at.lt(value, n),
- at.logsumexp(
+ pt.switch(
+ pt.lt(value, n),
+ pt.logsumexp(
logp(
BetaBinomial.dist(alpha=alpha, beta=beta, n=n),
- at.arange(safe_lower, value + 1),
+ pt.arange(safe_lower, value + 1),
),
keepdims=False,
),
@@ -349,21 +351,21 @@ def dist(cls, p=None, logit_p=None, *args, **kwargs):
raise ValueError("Incompatible parametrization. Must specify either p or logit_p.")
if logit_p is not None:
- p = at.sigmoid(logit_p)
+ p = pt.sigmoid(logit_p)
- p = at.as_tensor_variable(floatX(p))
+ p = pt.as_tensor_variable(floatX(p))
return super().dist([p], **kwargs)
def moment(rv, size, p):
if not rv_size_is_none(size):
- p = at.full(size, p)
- return at.switch(p < 0.5, 0, 1)
+ p = pt.full(size, p)
+ return pt.switch(p < 0.5, 0, 1)
def logp(value, p):
- res = at.switch(
- at.or_(at.lt(value, 0), at.gt(value, 1)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, 0), pt.gt(value, 1)),
-np.inf,
- at.switch(value, at.log(p), at.log1p(-p)),
+ pt.switch(value, pt.log(p), pt.log1p(-p)),
)
return check_parameters(
@@ -374,12 +376,12 @@ def logp(value, p):
)
def logcdf(value, p):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.switch(
- at.lt(value, 1),
- at.log1p(-p),
+ pt.switch(
+ pt.lt(value, 1),
+ pt.log1p(-p),
0,
),
)
@@ -459,21 +461,21 @@ def DiscreteWeibull(q, b, x):
@classmethod
def dist(cls, q, beta, *args, **kwargs):
- q = at.as_tensor_variable(floatX(q))
- beta = at.as_tensor_variable(floatX(beta))
+ q = pt.as_tensor_variable(floatX(q))
+ beta = pt.as_tensor_variable(floatX(beta))
return super().dist([q, beta], **kwargs)
def moment(rv, size, q, beta):
- median = at.power(at.log(0.5) / at.log(q), 1 / beta) - 1
+ median = pt.power(pt.log(0.5) / pt.log(q), 1 / beta) - 1
if not rv_size_is_none(size):
- median = at.full(size, median)
+ median = pt.full(size, median)
return median
def logp(value, q, beta):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log(at.power(q, at.power(value, beta)) - at.power(q, at.power(value + 1, beta))),
+ pt.log(pt.power(q, pt.power(value, beta)) - pt.power(q, pt.power(value + 1, beta))),
)
return check_parameters(
@@ -485,10 +487,10 @@ def logp(value, q, beta):
)
def logcdf(value, q, beta):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log1p(-at.power(q, at.power(value + 1, beta))),
+ pt.log1p(-pt.power(q, pt.power(value + 1, beta))),
)
return check_parameters(
res,
@@ -548,24 +550,24 @@ class Poisson(Discrete):
@classmethod
def dist(cls, mu, *args, **kwargs):
- mu = at.as_tensor_variable(floatX(mu))
+ mu = pt.as_tensor_variable(floatX(mu))
return super().dist([mu], *args, **kwargs)
def moment(rv, size, mu):
- mu = at.floor(mu)
+ mu = pt.floor(mu)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, mu):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
logpow(mu, value) - factln(value) - mu,
)
# Return zero when mu and value are both zero
- res = at.switch(
- at.eq(mu, 0) * at.eq(value, 0),
+ res = pt.switch(
+ pt.eq(mu, 0) * pt.eq(value, 0),
0,
res,
)
@@ -576,15 +578,15 @@ def logp(value, mu):
)
def logcdf(value, mu):
- value = at.floor(value)
+ value = pt.floor(value)
# Avoid C-assertion when the gammaincc function is called with invalid values (#4340)
- safe_mu = at.switch(at.lt(mu, 0), 0, mu)
- safe_value = at.switch(at.lt(value, 0), 0, value)
+ safe_mu = pt.switch(pt.lt(mu, 0), 0, mu)
+ safe_value = pt.switch(pt.lt(value, 0), 0, value)
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log(at.gammaincc(safe_value + 1, safe_mu)),
+ pt.log(pt.gammaincc(safe_value + 1, safe_mu)),
)
return check_parameters(
@@ -671,8 +673,8 @@ def NegBinom(a, m, x):
@classmethod
def dist(cls, mu=None, alpha=None, p=None, n=None, *args, **kwargs):
n, p = cls.get_n_p(mu=mu, alpha=alpha, p=p, n=n)
- n = at.as_tensor_variable(floatX(n))
- p = at.as_tensor_variable(floatX(p))
+ n = pt.as_tensor_variable(floatX(n))
+ p = pt.as_tensor_variable(floatX(p))
return super().dist([n, p], *args, **kwargs)
@classmethod
@@ -696,17 +698,17 @@ def get_n_p(cls, mu=None, alpha=None, p=None, n=None):
return n, p
def moment(rv, size, n, p):
- mu = at.floor(n * (1 - p) / p)
+ mu = pt.floor(n * (1 - p) / p)
if not rv_size_is_none(size):
- mu = at.full(size, mu)
+ mu = pt.full(size, mu)
return mu
def logp(value, n, p):
alpha = n
mu = alpha * (1 - p) / p
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
(
binomln(value + alpha - 1, value)
@@ -723,13 +725,13 @@ def logp(value, n, p):
)
# Return Poisson when alpha gets very large.
- return at.switch(at.gt(alpha, 1e10), logp(Poisson.dist(mu=mu), value), negbinom)
+ return pt.switch(pt.gt(alpha, 1e10), logp(Poisson.dist(mu=mu), value), negbinom)
def logcdf(value, n, p):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log(at.betainc(n, at.floor(value) + 1, p)),
+ pt.log(pt.betainc(n, pt.floor(value) + 1, p)),
)
return check_parameters(
res,
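Regarding the ``pt.switch(pt.gt(alpha, 1e10), ...)`` branch in ``logp`` above: with the PyMC parametrization ``n = alpha`` and ``p = alpha / (mu + alpha)``, the negative binomial converges to a Poisson with mean ``mu`` as ``alpha`` grows, which is why the large-``alpha`` branch simply delegates to ``Poisson``. A numerical illustration (not part of the patch):

    import numpy as np
    from scipy import stats

    mu, alpha, k = 3.0, 1e8, np.arange(10)
    nb = stats.nbinom.pmf(k, n=alpha, p=alpha / (mu + alpha))
    po = stats.poisson.pmf(k, mu)
    np.max(np.abs(nb - po))         # ~1e-8: the two pmfs already agree closely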
@@ -783,20 +785,20 @@ class Geometric(Discrete):
@classmethod
def dist(cls, p, *args, **kwargs):
- p = at.as_tensor_variable(floatX(p))
+ p = pt.as_tensor_variable(floatX(p))
return super().dist([p], *args, **kwargs)
def moment(rv, size, p):
- mean = at.round(1.0 / p)
+ mean = pt.round(1.0 / p)
if not rv_size_is_none(size):
- mean = at.full(size, mean)
+ mean = pt.full(size, mean)
return mean
def logp(value, p):
- res = at.switch(
- at.lt(value, 1),
+ res = pt.switch(
+ pt.lt(value, 1),
-np.inf,
- at.log(p) + logpow(1 - p, value - 1),
+ pt.log(p) + logpow(1 - p, value - 1),
)
return check_parameters(
@@ -807,10 +809,10 @@ def logp(value, p):
)
def logcdf(value, p):
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.log1mexp(at.log1p(-p) * value),
+ pt.log1mexp(pt.log1p(-p) * value),
)
return check_parameters(
res,
@@ -820,7 +822,14 @@ def logcdf(value, p):
)
def icdf(value, p):
- return at.ceil(at.log1p(-value) / at.log1p(-p)).astype("int64")
+ res = pt.ceil(pt.log1p(-value) / pt.log1p(-p)).astype("int64")
+ res = check_icdf_value(res, value)
+ return check_icdf_parameters(
+ res,
+ 0 <= p,
+ p <= 1,
+ msg="0 <= p <= 1",
+ )
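The reworked ``icdf`` keeps the closed form :math:`\lceil \log(1 - q) / \log(1 - p) \rceil`, now wrapped in the new value and parameter checks. Cross-check against SciPy (a sketch, not part of the patch):

    import numpy as np
    from scipy import stats

    p, q = 0.3, 0.75
    manual = np.ceil(np.log1p(-q) / np.log1p(-p))   # 4.0
    manual == stats.geom.ppf(q, p)                  # True; both use the support {1, 2, ...}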
class HyperGeometric(Discrete):
@@ -873,16 +882,16 @@ class HyperGeometric(Discrete):
@classmethod
def dist(cls, N, k, n, *args, **kwargs):
- good = at.as_tensor_variable(intX(k))
- bad = at.as_tensor_variable(intX(N - k))
- n = at.as_tensor_variable(intX(n))
+ good = pt.as_tensor_variable(intX(k))
+ bad = pt.as_tensor_variable(intX(N - k))
+ n = pt.as_tensor_variable(intX(n))
return super().dist([good, bad, n], *args, **kwargs)
def moment(rv, size, good, bad, n):
N, k = good + bad, good
- mode = at.floor((n + 1) * (k + 1) / (N + 2))
+ mode = pt.floor((n + 1) * (k + 1) / (N + 2))
if not rv_size_is_none(size):
- mode = at.full(size, mode)
+ mode = pt.full(size, mode)
return mode
def logp(value, good, bad, n):
@@ -896,14 +905,14 @@ def logp(value, good, bad, n):
- betaln(tot + 1, 1)
)
# value in [max(0, n - N + k), min(k, n)]
- lower = at.switch(at.gt(n - tot + good, 0), n - tot + good, 0)
- upper = at.switch(at.lt(good, n), good, n)
+ lower = pt.switch(pt.gt(n - tot + good, 0), n - tot + good, 0)
+ upper = pt.switch(pt.lt(good, n), good, n)
- res = at.switch(
- at.lt(value, lower),
+ res = pt.switch(
+ pt.lt(value, lower),
-np.inf,
- at.switch(
- at.le(value, upper),
+ pt.switch(
+ pt.le(value, upper),
result,
-np.inf,
),
@@ -924,15 +933,15 @@ def logcdf(value, good, bad, n):
N = good + bad
# TODO: Use lower and upper in logcdf for smarter logsumexp?
- safe_lower = at.switch(at.lt(value, 0), value, 0)
+ safe_lower = pt.switch(pt.lt(value, 0), value, 0)
- res = at.switch(
- at.lt(value, 0),
+ res = pt.switch(
+ pt.lt(value, 0),
-np.inf,
- at.switch(
- at.lt(value, n),
- at.logsumexp(
- HyperGeometric.logp(at.arange(safe_lower, value + 1), good, bad, n),
+ pt.switch(
+ pt.lt(value, n),
+ pt.logsumexp(
+ HyperGeometric.logp(pt.arange(safe_lower, value + 1), good, bad, n),
keepdims=False,
),
0,
@@ -1010,21 +1019,21 @@ class DiscreteUniform(Discrete):
@classmethod
def dist(cls, lower, upper, *args, **kwargs):
- lower = intX(at.floor(lower))
- upper = intX(at.floor(upper))
+ lower = intX(pt.floor(lower))
+ upper = intX(pt.floor(upper))
return super().dist([lower, upper], **kwargs)
def moment(rv, size, lower, upper):
- mode = at.maximum(at.floor((upper + lower) / 2.0), lower)
+ mode = pt.maximum(pt.floor((upper + lower) / 2.0), lower)
if not rv_size_is_none(size):
- mode = at.full(size, mode)
+ mode = pt.full(size, mode)
return mode
def logp(value, lower, upper):
- res = at.switch(
- at.or_(at.lt(value, lower), at.gt(value, upper)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, lower), pt.gt(value, upper)),
-np.inf,
- at.fill(value, -at.log(upper - lower + 1)),
+ pt.fill(value, -pt.log(upper - lower + 1)),
)
return check_parameters(
res,
@@ -1033,12 +1042,12 @@ def logp(value, lower, upper):
)
def logcdf(value, lower, upper):
- res = at.switch(
- at.le(value, lower),
+ res = pt.switch(
+ pt.le(value, lower),
-np.inf,
- at.switch(
- at.lt(value, upper),
- at.log(at.minimum(at.floor(value), upper) - lower + 1) - at.log(upper - lower + 1),
+ pt.switch(
+ pt.lt(value, upper),
+ pt.log(pt.minimum(pt.floor(value), upper) - lower + 1) - pt.log(upper - lower + 1),
0,
),
)
@@ -1049,6 +1058,15 @@ def logcdf(value, lower, upper):
msg="lower <= upper",
)
+ def icdf(value, lower, upper):
+ res = pt.ceil(value * (upper - lower + 1)).astype("int64") + lower - 1
+ res = check_icdf_value(res, value)
+ return check_icdf_parameters(
+ res,
+ lower <= upper,
+ msg="lower <= upper",
+ )
+
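The new ``icdf`` is the standard quantile of a discrete uniform on ``{lower, ..., upper}`` and matches SciPy's ``randint``, whose upper bound is exclusive (a sketch, not part of the patch):

    import numpy as np
    from scipy import stats

    lower, upper, q = 2, 7, 0.5
    manual = np.ceil(q * (upper - lower + 1)) + lower - 1   # 4.0
    manual == stats.randint.ppf(q, lower, upper + 1)        # True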
class Categorical(Discrete):
R"""
@@ -1099,7 +1117,7 @@ def dist(cls, p=None, logit_p=None, **kwargs):
if logit_p is not None:
p = pm.math.softmax(logit_p, axis=-1)
- p = at.as_tensor_variable(p)
+ p = pt.as_tensor_variable(p)
if isinstance(p, TensorConstant):
p_ = np.asarray(p.data)
if np.any(p_ < 0):
@@ -1112,38 +1130,38 @@ def dist(cls, p=None, logit_p=None, **kwargs):
"You can rescale them directly to get rid of this warning.",
UserWarning,
)
- p_ = p_ / at.sum(p_, axis=-1, keepdims=True)
- p = at.as_tensor_variable(p_)
+ p_ = p_ / pt.sum(p_, axis=-1, keepdims=True)
+ p = pt.as_tensor_variable(p_)
return super().dist([p], **kwargs)
def moment(rv, size, p):
- mode = at.argmax(p, axis=-1)
+ mode = pt.argmax(p, axis=-1)
if not rv_size_is_none(size):
- mode = at.full(size, mode)
+ mode = pt.full(size, mode)
return mode
def logp(value, p):
- k = at.shape(p)[-1]
+ k = pt.shape(p)[-1]
p_ = p
- value_clip = at.clip(value, 0, k - 1)
+ value_clip = pt.clip(value, 0, k - 1)
if p.ndim > 1:
if p.ndim > value_clip.ndim:
- value_clip = at.shape_padleft(value_clip, p_.ndim - value_clip.ndim)
+ value_clip = pt.shape_padleft(value_clip, p_.ndim - value_clip.ndim)
elif p.ndim < value_clip.ndim:
- p = at.shape_padleft(p, value_clip.ndim - p_.ndim)
+ p = pt.shape_padleft(p, value_clip.ndim - p_.ndim)
pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1))
- a = at.log(
- at.take_along_axis(
+ a = pt.log(
+ pt.take_along_axis(
p.dimshuffle(pattern),
value_clip,
)
)
else:
- a = at.log(p[value_clip])
+ a = pt.log(p[value_clip])
- res = at.switch(
- at.or_(at.lt(value, 0), at.gt(value, k - 1)),
+ res = pt.switch(
+ pt.or_(pt.lt(value, 0), pt.gt(value, k - 1)),
-np.inf,
a,
)
@@ -1152,7 +1170,7 @@ def logp(value, p):
res,
0 <= p_,
p_ <= 1,
- at.isclose(at.sum(p, axis=-1), 1),
+ pt.isclose(pt.sum(p, axis=-1), 1),
msg="0 <= p <=1, sum(p) = 1",
)
@@ -1164,7 +1182,7 @@ class DiracDeltaRV(RandomVariable):
_print_name = ("DiracDelta", "\\operatorname{DiracDelta}")
def make_node(self, rng, size, dtype, c):
- c = at.as_tensor_variable(c)
+ c = pt.as_tensor_variable(c)
return super().make_node(rng, size, c.dtype, c)
@classmethod
@@ -1193,26 +1211,26 @@ class DiracDelta(Discrete):
@classmethod
def dist(cls, c, *args, **kwargs):
- c = at.as_tensor_variable(c)
+ c = pt.as_tensor_variable(c)
if c.dtype in continuous_types:
c = floatX(c)
return super().dist([c], **kwargs)
def moment(rv, size, c):
if not rv_size_is_none(size):
- c = at.full(size, c)
+ c = pt.full(size, c)
return c
def logp(value, c):
- return at.switch(
- at.eq(value, c),
- at.zeros_like(value),
+ return pt.switch(
+ pt.eq(value, c),
+ pt.zeros_like(value),
-np.inf,
)
def logcdf(value, c):
- return at.switch(
- at.lt(value, c),
+ return pt.switch(
+ pt.lt(value, c),
-np.inf,
0,
)
@@ -1240,8 +1258,8 @@ def _zero_inflated_mixture(*, name, nonzero_p, nonzero_dist, **kwargs):
If name is `None`, this function returns an unregistered variable
"""
- nonzero_p = at.as_tensor_variable(floatX(nonzero_p))
- weights = at.stack([1 - nonzero_p, nonzero_p], axis=-1)
+ nonzero_p = pt.as_tensor_variable(floatX(nonzero_p))
+ weights = pt.stack([1 - nonzero_p, nonzero_p], axis=-1)
comp_dists = [
DiracDelta.dist(0),
nonzero_dist,
@@ -1454,7 +1472,7 @@ def ZeroInfNegBinom(a, m, psi, x):
psi : tensor_like of float
Expected proportion of NegativeBinomial variates (0 < psi < 1)
mu : tensor_like of float
- Poission distribution parameter (mu > 0).
+ Poisson distribution parameter (mu > 0).
alpha : tensor_like of float
Gamma distribution parameter (alpha > 0).
p : tensor_like of float
@@ -1490,15 +1508,15 @@ class _OrderedLogistic(Categorical):
@classmethod
def dist(cls, eta, cutpoints, *args, **kwargs):
- eta = at.as_tensor_variable(floatX(eta))
- cutpoints = at.as_tensor_variable(cutpoints)
+ eta = pt.as_tensor_variable(floatX(eta))
+ cutpoints = pt.as_tensor_variable(cutpoints)
- pa = sigmoid(cutpoints - at.shape_padright(eta))
- p_cum = at.concatenate(
+ pa = sigmoid(cutpoints - pt.shape_padright(eta))
+ p_cum = pt.concatenate(
[
- at.zeros_like(at.shape_padright(pa[..., 0])),
+ pt.zeros_like(pt.shape_padright(pa[..., 0])),
pa,
- at.ones_like(at.shape_padright(pa[..., 0])),
+ pt.ones_like(pt.shape_padright(pa[..., 0])),
],
axis=-1,
)
@@ -1563,7 +1581,7 @@ class OrderedLogistic:
# Ordered logistic regression
with pm.Model() as model:
cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2,
- transform=pm.distributions.transforms.ordered)
+ transform=pm.distributions.transforms.univariate_ordered)
y_ = pm.OrderedLogistic("y", cutpoints=cutpoints, eta=x, observed=y)
idata = pm.sample()
@@ -1596,22 +1614,22 @@ class _OrderedProbit(Categorical):
@classmethod
def dist(cls, eta, cutpoints, sigma=1, *args, **kwargs):
- eta = at.as_tensor_variable(floatX(eta))
- cutpoints = at.as_tensor_variable(cutpoints)
+ eta = pt.as_tensor_variable(floatX(eta))
+ cutpoints = pt.as_tensor_variable(cutpoints)
- probits = at.shape_padright(eta) - cutpoints
- _log_p = at.concatenate(
+ probits = pt.shape_padright(eta) - cutpoints
+ _log_p = pt.concatenate(
[
- at.shape_padright(normal_lccdf(0, sigma, probits[..., 0])),
+ pt.shape_padright(normal_lccdf(0, sigma, probits[..., 0])),
log_diff_normal_cdf(
- 0, at.shape_padright(sigma), probits[..., :-1], probits[..., 1:]
+ 0, pt.shape_padright(sigma), probits[..., :-1], probits[..., 1:]
),
- at.shape_padright(normal_lcdf(0, sigma, probits[..., -1])),
+ pt.shape_padright(normal_lcdf(0, sigma, probits[..., -1])),
],
axis=-1,
)
- _log_p = at.as_tensor_variable(floatX(_log_p))
- p = at.exp(_log_p)
+ _log_p = pt.as_tensor_variable(floatX(_log_p))
+ p = pt.exp(_log_p)
return super().dist(p, *args, **kwargs)
diff --git a/pymc/distributions/dist_math.py b/pymc/distributions/dist_math.py
index fbdea97440..7a72f27ebd 100644
--- a/pymc/distributions/dist_math.py
+++ b/pymc/distributions/dist_math.py
@@ -19,11 +19,12 @@
"""
import warnings
+from functools import partial
from typing import Iterable
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import scipy.linalg
import scipy.stats
@@ -50,22 +51,46 @@
}
-def check_parameters(logp: Variable, *conditions: Iterable[Variable], msg: str = ""):
- """
- Wrap a log probability graph in a CheckParameterValue that asserts several
- conditions are True. When conditions are not met a ParameterValueError assertion is
- raised, with an optional custom message defined by `msg`
+def check_parameters(
+ expr: Variable,
+ *conditions: Iterable[Variable],
+ msg: str = "",
+ can_be_replaced_by_ninf: bool = True,
+):
+ """Wrap an expression in a CheckParameterValue that asserts several conditions are met.
+
+ When conditions are not met a ParameterValueError assertion is raised,
+ with an optional custom message defined by `msg`.
+
+ When the flag `can_be_replaced_by_ninf` is True (default), PyMC is allowed to replace the
+ assertion with a switch(condition, expr, -inf). This is used for logp graphs.
- Note that check_parameter should not be used to enforce the logic of the logp
+ Note that check_parameters should not be used to enforce the logic of the
expression under the normal parameter support as it can be disabled by the user via
check_bounds = False in pm.Model()
"""
- # at.all does not accept True/False, but accepts np.array(True)/np.array(False)
+ # pt.all does not accept True/False, but accepts np.array(True)/np.array(False)
conditions_ = [
cond if (cond is not True and cond is not False) else np.array(cond) for cond in conditions
]
- all_true_scalar = at.all([at.all(cond) for cond in conditions_])
- return CheckParameterValue(msg)(logp, all_true_scalar)
+ all_true_scalar = pt.all([pt.all(cond) for cond in conditions_])
+
+ return CheckParameterValue(msg, can_be_replaced_by_ninf)(expr, all_true_scalar)
+
+
+check_icdf_parameters = partial(check_parameters, can_be_replaced_by_ninf=False)
+
+
+def check_icdf_value(expr: Variable, value: Variable) -> Variable:
+ """Wrap icdf expression in nan switch for value."""
+ value = pt.as_tensor_variable(value)
+ expr = pt.switch(
+ pt.and_(value >= 0, value <= 1),
+ expr,
+ np.nan,
+ )
+ expr.name = "0 <= value <= 1"
+ return expr
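Together these helpers give every ``icdf`` a uniform contract: ``check_icdf_value`` maps probabilities outside ``[0, 1]`` to ``nan``, while ``check_icdf_parameters`` keeps a hard assertion because, unlike logp checks, it is created with ``can_be_replaced_by_ninf=False`` and so can never be rewritten into a ``-inf`` switch. A minimal sketch of the value guard, using an exponential-style quantile with ``mu = 2.5`` as a stand-in (not part of the patch):

    import numpy as np
    import pytensor
    import pytensor.tensor as pt

    value = pt.scalar("value")
    mu = 2.5
    guarded = pt.switch(pt.and_(value >= 0, value <= 1), -mu * pt.log(1 - value), np.nan)
    f = pytensor.function([value], guarded)
    f(0.9)   # ~5.756
    f(1.5)   # nan: the out-of-support probability is caught by the guard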
def logpow(x, m):
@@ -73,7 +98,7 @@ def logpow(x, m):
Calculates log(x**m) since m*log(x) will fail when m, x = 0.
"""
# return m * log(x)
- return at.switch(at.eq(x, 0), at.switch(at.eq(m, 0), 0.0, -np.inf), m * at.log(x))
+ return pt.switch(pt.eq(x, 0), pt.switch(pt.eq(m, 0), 0.0, -np.inf), m * pt.log(x))
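The point of ``logpow`` is the corner case ``x = m = 0``: a naive ``m * log(x)`` evaluates to ``0 * (-inf) = nan``, whereas ``log(x**m) = log(1) = 0``. A one-line check in NumPy (not part of the patch):

    import numpy as np

    with np.errstate(divide="ignore", invalid="ignore"):
        0 * np.log(0.0)       # nan
    np.log(0.0 ** 0)          # 0.0, since 0**0 == 1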
def factln(n):
@@ -92,25 +117,25 @@ def std_cdf(x):
"""
Calculates the standard normal cumulative distribution function.
"""
- return 0.5 + 0.5 * at.erf(x / at.sqrt(2.0))
+ return 0.5 + 0.5 * pt.erf(x / pt.sqrt(2.0))
def normal_lcdf(mu, sigma, x):
"""Compute the log of the cumulative density function of the normal."""
z = (x - mu) / sigma
- return at.switch(
- at.lt(z, -1.0),
- at.log(at.erfcx(-z / at.sqrt(2.0)) / 2.0) - at.sqr(z) / 2.0,
- at.log1p(-at.erfc(z / at.sqrt(2.0)) / 2.0),
+ return pt.switch(
+ pt.lt(z, -1.0),
+ pt.log(pt.erfcx(-z / pt.sqrt(2.0)) / 2.0) - pt.sqr(z) / 2.0,
+ pt.log1p(-pt.erfc(z / pt.sqrt(2.0)) / 2.0),
)
def normal_lccdf(mu, sigma, x):
z = (x - mu) / sigma
- return at.switch(
- at.gt(z, 1.0),
- at.log(at.erfcx(z / at.sqrt(2.0)) / 2.0) - at.sqr(z) / 2.0,
- at.log1p(-at.erfc(-z / at.sqrt(2.0)) / 2.0),
+ return pt.switch(
+ pt.gt(z, 1.0),
+ pt.log(pt.erfcx(z / pt.sqrt(2.0)) / 2.0) - pt.sqr(z) / 2.0,
+ pt.log1p(-pt.erfc(-z / pt.sqrt(2.0)) / 2.0),
)
@@ -135,21 +160,21 @@ def log_diff_normal_cdf(mu, sigma, x, y):
log (\\Phi(x) - \\Phi(y))
"""
- x = (x - mu) / sigma / at.sqrt(2.0)
- y = (y - mu) / sigma / at.sqrt(2.0)
+ x = (x - mu) / sigma / pt.sqrt(2.0)
+ y = (y - mu) / sigma / pt.sqrt(2.0)
# To stabilize the computation, consider these three regions:
# 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) = 1 - e^{-y^2} erfcx(y)
# 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y)
# 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
- return at.log(0.5) + at.switch(
- at.gt(y, 0),
- -at.square(y) + at.log(at.erfcx(y) - at.exp(at.square(y) - at.square(x)) * at.erfcx(x)),
- at.switch(
- at.lt(x, 0), # 0 > x > y
- -at.square(x)
- + at.log(at.erfcx(-x) - at.exp(at.square(x) - at.square(y)) * at.erfcx(-y)),
- at.log(at.erf(x) - at.erf(y)), # x >0 > y
+ return pt.log(0.5) + pt.switch(
+ pt.gt(y, 0),
+ -pt.square(y) + pt.log(pt.erfcx(y) - pt.exp(pt.square(y) - pt.square(x)) * pt.erfcx(x)),
+ pt.switch(
+ pt.lt(x, 0), # 0 > x > y
+ -pt.square(x)
+ + pt.log(pt.erfcx(-x) - pt.exp(pt.square(x) - pt.square(y)) * pt.erfcx(-y)),
+ pt.log(pt.erf(x) - pt.erf(y)),  # x > 0 > y
),
)
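The three-region split above exists because the naive ``log(Phi(x) - Phi(y))`` underflows once both arguments sit deep in the upper tail, while the ``erfcx`` identity ``erf(t) = 1 - exp(-t**2) * erfcx(t)`` keeps the dominant exponential factored out. A NumPy sketch of region 1 (``x > y > 0``), not part of the patch:

    import numpy as np
    from scipy.special import erfcx
    from scipy.stats import norm

    def region1(x, y):
        # log(Phi(x) - Phi(y)) for x > y > 0, in standard-normal units
        xs, ys = x / np.sqrt(2), y / np.sqrt(2)
        return np.log(0.5) - ys**2 + np.log(erfcx(ys) - np.exp(ys**2 - xs**2) * erfcx(xs))

    np.log(norm.cdf(12.0) - norm.cdf(10.0))   # -inf: both CDFs round to 1.0 in float64
    region1(12.0, 10.0)                       # ~-53.2, the correct log-probability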
@@ -158,14 +183,14 @@ def sigma2rho(sigma):
"""
`sigma -> rho` PyTensor converter
:math:`mu + sigma*e = mu + log(1+exp(rho))*e`"""
- return at.log(at.exp(at.abs(sigma)) - 1.0)
+ return pt.log(pt.exp(pt.abs(sigma)) - 1.0)
def rho2sigma(rho):
"""
`rho -> sigma` PyTensor converter
:math:`mu + sigma*e = mu + log(1+exp(rho))*e`"""
- return at.softplus(rho)
+ return pt.softplus(rho)
rho2sd = rho2sigma
@@ -208,13 +233,13 @@ def log_normal(x, mean, **kwargs):
if sigma is not None:
std = sigma
elif w is not None:
- std = at.exp(w)
+ std = pt.exp(w)
elif rho is not None:
std = rho2sigma(rho)
else:
std = tau ** (-1)
std += f(eps)
- return f(c) - at.log(at.abs(std)) - (x - mean) ** 2 / (2.0 * std**2)
+ return f(c) - pt.log(pt.abs(std)) - (x - mean) ** 2 / (2.0 * std**2)
def MvNormalLogp():
@@ -224,14 +249,14 @@ def MvNormalLogp():
Parameters
----------
- cov: at.matrix
+ cov: pt.matrix
The covariance matrix.
- delta: at.matrix
+ delta: pt.matrix
Array of deviations from the mean.
"""
- cov = at.matrix("cov")
+ cov = pt.matrix("cov")
cov.tag.test_value = floatX(np.eye(3))
- delta = at.matrix("delta")
+ delta = pt.matrix("delta")
delta.tag.test_value = floatX(np.zeros((2, 3)))
cholesky = Cholesky(lower=True, on_error="nan")
@@ -239,17 +264,17 @@ def MvNormalLogp():
n, k = delta.shape
n, k = f(n), f(k)
chol_cov = cholesky(cov)
- diag = at.diag(chol_cov)
- ok = at.all(diag > 0)
+ diag = pt.diag(chol_cov)
+ ok = pt.all(diag > 0)
- chol_cov = at.switch(ok, chol_cov, at.fill(chol_cov, 1))
+ chol_cov = pt.switch(ok, chol_cov, pt.fill(chol_cov, 1))
delta_trans = solve_lower(chol_cov, delta.T).T
- result = n * k * at.log(f(2) * np.pi)
- result += f(2) * n * at.sum(at.log(diag))
+ result = n * k * pt.log(f(2) * np.pi)
+ result += f(2) * n * pt.sum(pt.log(diag))
result += (delta_trans ** f(2)).sum()
result = f(-0.5) * result
- logp = at.switch(ok, result, -np.inf)
+ logp = pt.switch(ok, result, -np.inf)
def dlogp(inputs, gradients):
(g_logp,) = gradients
@@ -259,21 +284,21 @@ def dlogp(inputs, gradients):
n, k = delta.shape
chol_cov = cholesky(cov)
- diag = at.diag(chol_cov)
- ok = at.all(diag > 0)
+ diag = pt.diag(chol_cov)
+ ok = pt.all(diag > 0)
- chol_cov = at.switch(ok, chol_cov, at.fill(chol_cov, 1))
+ chol_cov = pt.switch(ok, chol_cov, pt.fill(chol_cov, 1))
delta_trans = solve_lower(chol_cov, delta.T).T
- inner = n * at.eye(k) - at.dot(delta_trans.T, delta_trans)
+ inner = n * pt.eye(k) - pt.dot(delta_trans.T, delta_trans)
g_cov = solve_upper(chol_cov.T, inner)
g_cov = solve_upper(chol_cov.T, g_cov.T)
tau_delta = solve_upper(chol_cov.T, delta_trans.T)
g_delta = tau_delta.T
- g_cov = at.switch(ok, g_cov, -np.nan)
- g_delta = at.switch(ok, g_delta, -np.nan)
+ g_cov = pt.switch(ok, g_cov, -np.nan)
+ g_delta = pt.switch(ok, g_delta, -np.nan)
return [-0.5 * g_cov * g_logp, -g_delta * g_logp]
@@ -291,7 +316,7 @@ def __init__(self, spline):
self.spline = spline
def make_node(self, x):
- x = at.as_tensor_variable(x)
+ x = pt.as_tensor_variable(x)
return Apply(self, [x], [x.type()])
@property
@@ -345,7 +370,7 @@ def impl(self, x):
def grad(self, inp, grads):
(x,) = inp
(gz,) = grads
- return (gz * (i1e_scalar(x) - pytensor.scalar.sgn(x) * i0e_scalar(x)),)
+ return (gz * (i1e_scalar(x) - pytensor.scalar.sign(x) * i0e_scalar(x)),)
i0e_scalar = I0e(upgrade_to_float_no_complex, name="i0e")
@@ -447,17 +472,17 @@ def multigammaln(a, p):
p: int
degrees of freedom. p > 0
"""
- i = at.arange(1, p + 1)
- return p * (p - 1) * at.log(np.pi) / 4.0 + at.sum(gammaln(a + (1.0 - i) / 2.0), axis=0)
+ i = pt.arange(1, p + 1)
+ return p * (p - 1) * pt.log(np.pi) / 4.0 + pt.sum(gammaln(a + (1.0 - i) / 2.0), axis=0)
def log_i0(x):
"""
     Calculates the logarithm of the zeroth-order modified Bessel function of the first kind.
"""
- return at.switch(
- at.lt(x, 5),
- at.log1p(
+ return pt.switch(
+ pt.lt(x, 5),
+ pt.log1p(
x**2.0 / 4.0
+ x**4.0 / 64.0
+ x**6.0 / 2304.0
@@ -466,8 +491,8 @@ def log_i0(x):
+ x**12.0 / 2123366400.0
),
x
- - 0.5 * at.log(2.0 * np.pi * x)
- + at.log1p(
+ - 0.5 * pt.log(2.0 * np.pi * x)
+ + pt.log1p(
1.0 / (8.0 * x)
+ 9.0 / (128.0 * x**2.0)
+ 225.0 / (3072.0 * x**3.0)
@@ -482,4 +507,4 @@ def incomplete_beta(a, b, value):
FutureWarning,
stacklevel=2,
)
- return at.betainc(a, b, value)
+ return pt.betainc(a, b, value)
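The bulk of the hunks above are the mechanical ``at`` -> ``pt`` alias rename; downstream code that imported the alias directly would migrate along the same lines (a sketch, not part of the diff)::

    # before: import pytensor.tensor as at
    import pytensor.tensor as pt

    x = pt.vector("x")
    y = pt.log1p(pt.exp(x))  # same graph, new alias
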
diff --git a/pymc/distributions/distribution.py b/pymc/distributions/distribution.py
index 9a47cf1390..0a003c6598 100644
--- a/pymc/distributions/distribution.py
+++ b/pymc/distributions/distribution.py
@@ -23,7 +23,7 @@
import numpy as np
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.compile.builders import OpFromGraph
from pytensor.graph import node_rewriter
from pytensor.graph.basic import Node, Variable
@@ -488,7 +488,6 @@ class _CustomDist(Distribution):
def dist(
cls,
*dist_params,
- class_name: str,
logp: Optional[Callable] = None,
logcdf: Optional[Callable] = None,
random: Optional[Callable] = None,
@@ -496,6 +495,7 @@ def dist(
ndim_supp: int = 0,
ndims_params: Optional[Sequence[int]] = None,
dtype: str = "floatX",
+ class_name: str = "CustomDist",
**kwargs,
):
dist_params = [as_tensor_variable(param) for param in dist_params]
@@ -523,7 +523,6 @@ def dist(
return super().dist(
dist_params,
- class_name=class_name,
logp=logp,
logcdf=logcdf,
random=random,
@@ -531,6 +530,7 @@ def dist(
ndim_supp=ndim_supp,
ndims_params=ndims_params,
dtype=dtype,
+ class_name=class_name,
**kwargs,
)
@@ -538,7 +538,6 @@ def dist(
def rv_op(
cls,
*dist_params,
- class_name: str,
logp: Optional[Callable],
logcdf: Optional[Callable],
random: Optional[Callable],
@@ -546,13 +545,14 @@ def rv_op(
ndim_supp: int,
ndims_params: Optional[Sequence[int]],
dtype: str,
+ class_name: str,
**kwargs,
):
rv_type = type(
- f"CustomDistRV_{class_name}",
+ class_name,
(CustomDistRV,),
dict(
- name=f"CustomDist_{class_name}",
+ name=class_name,
inplace=False,
ndim_supp=ndim_supp,
ndims_params=ndims_params,
@@ -593,7 +593,9 @@ class CustomSymbolicDistRV(SymbolicRandomVariable):
def update(self, node: Node):
op = node.op
- inner_updates = collect_default_updates(op.inner_inputs, op.inner_outputs)
+ inner_updates = collect_default_updates(
+ op.inner_inputs, op.inner_outputs, must_be_shared=False
+ )
# Map inner updates to outer inputs/outputs
updates = {}
@@ -611,20 +613,15 @@ class _CustomSymbolicDist(Distribution):
def dist(
cls,
*dist_params,
- class_name: str,
dist: Callable,
logp: Optional[Callable] = None,
logcdf: Optional[Callable] = None,
moment: Optional[Callable] = None,
ndim_supp: int = 0,
dtype: str = "floatX",
+ class_name: str = "CustomSymbolicDist",
**kwargs,
):
- warnings.warn(
- "CustomDist with dist function is still experimental. Expect bugs!",
- UserWarning,
- )
-
dist_params = [as_tensor_variable(param) for param in dist_params]
if logcdf is None:
@@ -653,13 +650,13 @@ def dist(
def rv_op(
cls,
*dist_params,
- class_name: str,
dist: Callable,
logp: Optional[Callable],
logcdf: Optional[Callable],
moment: Optional[Callable],
size=None,
ndim_supp: int,
+ class_name: str,
):
size = normalize_size_param(size)
dummy_size_param = size.type()
@@ -672,7 +669,7 @@ def rv_op(
dummy_updates_dict = collect_default_updates(dummy_params, (dummy_rv,))
rv_type = type(
- f"CustomSymbolicDistRV_{class_name}",
+ class_name,
(CustomSymbolicDistRV,),
# If logp is not provided, we try to infer it from the dist graph
dict(
@@ -687,9 +684,11 @@ def rv_op(
def custom_dist_logp(op, values, size, *params, **kwargs):
return logp(values[0], *params[: len(dist_params)])
- @_logcdf.register(rv_type)
- def custom_dist_logcdf(op, value, size, *params, **kwargs):
- return logcdf(value, *params[: len(dist_params)])
+ if logcdf is not None:
+
+ @_logcdf.register(rv_type)
+ def custom_dist_logcdf(op, value, size, *params, **kwargs):
+ return logcdf(value, *params[: len(dist_params)])
@_moment.register(rv_type)
def custom_dist_get_moment(op, rv, size, *params):
@@ -703,7 +702,7 @@ def change_custom_symbolic_dist_size(op, rv, new_size, expand):
shape = tuple(rv.shape)
old_size = shape[: len(shape) - node.op.ndim_supp]
new_size = tuple(new_size) + tuple(old_size)
- new_size = at.as_tensor(new_size, ndim=1, dtype="int64")
+ new_size = pt.as_tensor(new_size, ndim=1, dtype="int64")
old_size, *old_dist_params = node.inputs[: len(dist_params) + 1]
@@ -756,15 +755,6 @@ class CustomDist:
dist_params : Tuple
         A sequence of the distribution's parameters. These will be converted into
         PyTensor tensor variables internally.
- class_name : str
- Name for the class which will wrap the CustomDist methods. When not specified,
- it will be given the name of the model variable.
-
- .. warning:: New CustomDists created with the same class_name will override the
- methods dispatched onto the previous classes. If using CustomDists with
- different methods across separate models, be sure to use distinct
- class_names.
-
dist: Optional[Callable]
         A callable that returns a PyTensor graph, built from simpler PyMC distributions,
         which represents the distribution. This can be used by PyMC to take random draws
@@ -829,6 +819,9 @@ class CustomDist:
         The dtype of the distribution. All draws and observations passed into the
         distribution will be cast to this dtype. This is not needed if a PyTensor
         dist function is provided, which should already return the right dtype!
+ class_name : str
+ Name for the class which will wrap the CustomDist methods. When not specified,
+        it will be derived from the name of the model variable.
kwargs :
Extra keyword arguments are passed to the parent's class ``__new__`` method.
@@ -977,10 +970,10 @@ def __new__(
dist_params = cls.parse_dist_params(dist_params)
cls.check_valid_dist_random(dist, random, dist_params)
if dist is not None:
+ kwargs.setdefault("class_name", f"CustomSymbolicDist_{name}")
return _CustomSymbolicDist(
name,
*dist_params,
- class_name=name,
dist=dist,
logp=logp,
logcdf=logcdf,
@@ -988,25 +981,25 @@ def __new__(
ndim_supp=ndim_supp,
**kwargs,
)
- return _CustomDist(
- name,
- *dist_params,
- class_name=name,
- random=random,
- logp=logp,
- logcdf=logcdf,
- moment=moment,
- ndim_supp=ndim_supp,
- ndims_params=ndims_params,
- dtype=dtype,
- **kwargs,
- )
+ else:
+ kwargs.setdefault("class_name", f"CustomDist_{name}")
+ return _CustomDist(
+ name,
+ *dist_params,
+ random=random,
+ logp=logp,
+ logcdf=logcdf,
+ moment=moment,
+ ndim_supp=ndim_supp,
+ ndims_params=ndims_params,
+ dtype=dtype,
+ **kwargs,
+ )
@classmethod
def dist(
cls,
*dist_params,
- class_name: str,
dist: Optional[Callable] = None,
random: Optional[Callable] = None,
logp: Optional[Callable] = None,
@@ -1022,7 +1015,6 @@ def dist(
if dist is not None:
return _CustomSymbolicDist.dist(
*dist_params,
- class_name=class_name,
dist=dist,
logp=logp,
logcdf=logcdf,
@@ -1033,7 +1025,6 @@ def dist(
else:
return _CustomDist.dist(
*dist_params,
- class_name=class_name,
random=random,
logp=logp,
logcdf=logcdf,
@@ -1105,9 +1096,9 @@ def func(*args, **kwargs):
def default_moment(rv, size, *rv_inputs, rv_name=None, has_fallback=False, ndim_supp=0):
if ndim_supp == 0:
- return at.zeros(size, dtype=rv.dtype)
+ return pt.zeros(size, dtype=rv.dtype)
elif has_fallback:
- return at.zeros_like(rv)
+ return pt.zeros_like(rv)
else:
raise TypeError(
"Cannot safely infer the size of a multivariate random variable's moment. "
diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py
index 70b92630ff..769dd3a671 100644
--- a/pymc/distributions/mixture.py
+++ b/pymc/distributions/mixture.py
@@ -15,7 +15,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Node, equal_computations
from pytensor.tensor import TensorVariable
@@ -32,8 +32,7 @@
)
from pymc.distributions.shape_utils import _change_dist_size, change_dist_size
from pymc.distributions.transforms import _default_transform
-from pymc.logprob.abstract import _logcdf, _logprob, logcdf
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.abstract import _logcdf, _logcdf_helper, _logprob, _logprob_helper
from pymc.logprob.transforms import IntervalTransform
from pymc.logprob.utils import ignore_logprob
from pymc.util import check_dist_not_registered
@@ -205,7 +204,7 @@ def dist(cls, w, comp_dists, **kwargs):
f"Mixture components must all have the same support dimensionality, got {components_ndim_supp}"
)
- w = at.as_tensor_variable(w)
+ w = pt.as_tensor_variable(w)
return super().dist([w, *comp_dists], **kwargs)
@classmethod
@@ -220,7 +219,7 @@ def rv_op(cls, weights, *components, size=None):
components = cls._resize_components(size, *components)
elif not single_component:
# We might need to broadcast components when size is not specified
- shape = tuple(at.broadcast_shape(*components))
+ shape = tuple(pt.broadcast_shape(*components))
size = shape[: len(shape) - ndim_supp]
components = cls._resize_components(size, *components)
@@ -238,7 +237,7 @@ def rv_op(cls, weights, *components, size=None):
         # we try to resize them. This is necessary to avoid duplicated values in the
         # random method and for equivalence with the logp method
if weights_ndim_batch:
- new_size = at.concatenate(
+ new_size = pt.concatenate(
[
weights.shape[:weights_ndim_batch],
components[0].shape[:ndim_batch],
@@ -271,19 +270,19 @@ def rv_op(cls, weights, *components, size=None):
# If single component, we consider it as being already "stacked"
stacked_components_ = components_[0]
else:
- stacked_components_ = at.stack(components_, axis=mix_axis)
+ stacked_components_ = pt.stack(components_, axis=mix_axis)
# Broadcast weights to (*batched dimensions, stack dimension), ignoring support dimensions
weights_broadcast_shape_ = stacked_components_.shape[: ndim_batch + 1]
- weights_broadcasted_ = at.broadcast_to(weights_, weights_broadcast_shape_)
+ weights_broadcasted_ = pt.broadcast_to(weights_, weights_broadcast_shape_)
# Draw mixture indexes and append (stack + ndim_supp) broadcastable dimensions to the right
- mix_indexes_ = at.random.categorical(weights_broadcasted_, rng=mix_indexes_rng_)
- mix_indexes_padded_ = at.shape_padright(mix_indexes_, ndim_supp + 1)
+ mix_indexes_ = pt.random.categorical(weights_broadcasted_, rng=mix_indexes_rng_)
+ mix_indexes_padded_ = pt.shape_padright(mix_indexes_, ndim_supp + 1)
# Index components and squeeze mixture dimension
- mix_out_ = at.take_along_axis(stacked_components_, mix_indexes_padded_, axis=mix_axis)
- mix_out_ = at.squeeze(mix_out_, axis=mix_axis)
+ mix_out_ = pt.take_along_axis(stacked_components_, mix_indexes_padded_, axis=mix_axis)
+ mix_out_ = pt.squeeze(mix_out_, axis=mix_axis)
# Output mix_indexes rng update so that it can be updated in place
mix_indexes_rng_next_ = mix_indexes_.owner.outputs[0]
@@ -337,20 +336,20 @@ def marginal_mixture_logprob(op, values, rng, weights, *components, **kwargs):
if len(components) == 1:
# Need to broadcast value across mixture axis
mix_axis = -components[0].owner.op.ndim_supp - 1
- components_logp = logp(components[0], at.expand_dims(value, mix_axis))
+ components_logp = _logprob_helper(components[0], pt.expand_dims(value, mix_axis))
else:
- components_logp = at.stack(
- [logp(component, value) for component in components],
+ components_logp = pt.stack(
+ [_logprob_helper(component, value) for component in components],
axis=-1,
)
- mix_logp = at.logsumexp(at.log(weights) + components_logp, axis=-1)
+ mix_logp = pt.logsumexp(pt.log(weights) + components_logp, axis=-1)
mix_logp = check_parameters(
mix_logp,
0 <= weights,
weights <= 1,
- at.isclose(at.sum(weights, axis=-1), 1),
+ pt.isclose(pt.sum(weights, axis=-1), 1),
msg="0 <= weights <= 1, sum(weights) == 1",
)
@@ -363,20 +362,20 @@ def marginal_mixture_logcdf(op, value, rng, weights, *components, **kwargs):
if len(components) == 1:
# Need to broadcast value across mixture axis
mix_axis = -components[0].owner.op.ndim_supp - 1
- components_logcdf = logcdf(components[0], at.expand_dims(value, mix_axis))
+ components_logcdf = _logcdf_helper(components[0], pt.expand_dims(value, mix_axis))
else:
- components_logcdf = at.stack(
- [logcdf(component, value) for component in components],
+ components_logcdf = pt.stack(
+ [_logcdf_helper(component, value) for component in components],
axis=-1,
)
- mix_logcdf = at.logsumexp(at.log(weights) + components_logcdf, axis=-1)
+ mix_logcdf = pt.logsumexp(pt.log(weights) + components_logcdf, axis=-1)
mix_logcdf = check_parameters(
mix_logcdf,
0 <= weights,
weights <= 1,
- at.isclose(at.sum(weights, axis=-1), 1),
+ pt.isclose(pt.sum(weights, axis=-1), 1),
msg="0 <= weights <= 1, sum(weights) == 1",
)
@@ -386,21 +385,21 @@ def marginal_mixture_logcdf(op, value, rng, weights, *components, **kwargs):
@_moment.register(MarginalMixtureRV)
def marginal_mixture_moment(op, rv, rng, weights, *components):
ndim_supp = components[0].owner.op.ndim_supp
- weights = at.shape_padright(weights, ndim_supp)
+ weights = pt.shape_padright(weights, ndim_supp)
mix_axis = -ndim_supp - 1
if len(components) == 1:
moment_components = moment(components[0])
else:
- moment_components = at.stack(
+ moment_components = pt.stack(
[moment(component) for component in components],
axis=mix_axis,
)
- mix_moment = at.sum(weights * moment_components, axis=mix_axis)
+ mix_moment = pt.sum(weights * moment_components, axis=mix_axis)
if components[0].dtype in discrete_types:
- mix_moment = at.round(mix_moment)
+ mix_moment = pt.round(mix_moment)
return mix_moment
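The logp/logcdf rewrites in this file swap the public helpers for the internal ``_logprob_helper``/``_logcdf_helper`` but leave the mixture density itself unchanged; a NumPy sketch of the identity (illustrative parameters)::

    import numpy as np
    from scipy import stats
    from scipy.special import logsumexp

    w = np.array([0.3, 0.7])
    x = 0.5
    components_logp = np.array(
        [stats.norm(-1.0, 1.0).logpdf(x), stats.norm(2.0, 0.5).logpdf(x)]
    )
    mix_logp = logsumexp(np.log(w) + components_logp)

    direct = np.log(
        w[0] * stats.norm(-1.0, 1.0).pdf(x) + w[1] * stats.norm(2.0, 0.5).pdf(x)
    )
    assert np.isclose(mix_logp, direct)
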
diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py
index edba23f3fe..b3ddde25bf 100644
--- a/pymc/distributions/multivariate.py
+++ b/pymc/distributions/multivariate.py
@@ -22,7 +22,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import scipy
from pytensor.graph.basic import Apply, Constant, Variable
@@ -59,7 +59,7 @@
)
from pymc.distributions.shape_utils import (
_change_dist_size,
- broadcast_dist_samples_to,
+ broadcast_dist_samples_shape,
change_dist_size,
get_support_shape,
rv_size_is_none,
@@ -118,11 +118,11 @@ def quaddist_matrix(cov=None, chol=None, tau=None, lower=True, *args, **kwargs):
raise ValueError("Incompatible parameterization. Specify exactly one of tau, cov, or chol.")
if cov is not None:
- cov = at.as_tensor_variable(cov)
+ cov = pt.as_tensor_variable(cov)
if cov.ndim != 2:
raise ValueError("cov must be two dimensional.")
elif tau is not None:
- tau = at.as_tensor_variable(tau)
+ tau = pt.as_tensor_variable(tau)
if tau.ndim != 2:
raise ValueError("tau must be two dimensional.")
# TODO: What's the correct order/approach (in the non-square case)?
@@ -130,7 +130,7 @@ def quaddist_matrix(cov=None, chol=None, tau=None, lower=True, *args, **kwargs):
cov = matrix_inverse(tau)
else:
# TODO: What's the correct order/approach (in the non-square case)?
- chol = at.as_tensor_variable(chol)
+ chol = pt.as_tensor_variable(chol)
if chol.ndim != 2:
raise ValueError("chol must be two dimensional.")
cov = chol.dot(chol.T)
@@ -163,30 +163,30 @@ def quaddist_parse(value, mu, cov, mat_type="cov"):
def quaddist_chol(delta, chol_mat):
- diag = at.diag(chol_mat)
+ diag = pt.diag(chol_mat)
# Check if the covariance matrix is positive definite.
- ok = at.all(diag > 0)
+ ok = pt.all(diag > 0)
# If not, replace the diagonal. We return -inf later, but
# need to prevent solve_lower from throwing an exception.
- chol_cov = at.switch(ok, chol_mat, 1)
+ chol_cov = pt.switch(ok, chol_mat, 1)
delta_trans = solve_lower(chol_cov, delta.T).T
quaddist = (delta_trans**2).sum(axis=-1)
- logdet = at.sum(at.log(diag))
+ logdet = pt.sum(pt.log(diag))
return quaddist, logdet, ok
def quaddist_tau(delta, chol_mat):
- diag = at.nlinalg.diag(chol_mat)
+ diag = pt.nlinalg.diag(chol_mat)
# Check if the precision matrix is positive definite.
- ok = at.all(diag > 0)
+ ok = pt.all(diag > 0)
# If not, replace the diagonal. We return -inf later, but
# need to prevent solve_lower from throwing an exception.
- chol_tau = at.switch(ok, chol_mat, 1)
+ chol_tau = pt.switch(ok, chol_mat, 1)
- delta_trans = at.dot(delta, chol_tau)
+ delta_trans = pt.dot(delta, chol_tau)
quaddist = (delta_trans**2).sum(axis=-1)
- logdet = -at.sum(at.log(diag))
+ logdet = -pt.sum(pt.log(diag))
return quaddist, logdet, ok
@@ -252,23 +252,23 @@ class MvNormal(Continuous):
chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=3, eta=2,
sd_dist=sd_dist, compute_corr=True)
vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=(5, 3))
- vals = pm.Deterministic('vals', at.dot(chol, vals_raw.T).T)
+ vals = pm.Deterministic('vals', pt.dot(chol, vals_raw.T).T)
"""
rv_op = multivariate_normal
@classmethod
def dist(cls, mu, cov=None, tau=None, chol=None, lower=True, **kwargs):
- mu = at.as_tensor_variable(mu)
+ mu = pt.as_tensor_variable(mu)
cov = quaddist_matrix(cov, chol, tau, lower)
         # PyTensor is stricter about the shape of mu than PyMC used to be
- mu = at.broadcast_arrays(mu, cov[..., -1])[0]
+ mu = pt.broadcast_arrays(mu, cov[..., -1])[0]
return super().dist([mu, cov], **kwargs)
def moment(rv, size, mu, cov):
moment = mu
if not rv_size_is_none(size):
- moment_size = at.concatenate([size, [mu.shape[-1]]])
- moment = at.full(moment_size, mu)
+ moment_size = pt.concatenate([size, [mu.shape[-1]]])
+ moment = pt.full(moment_size, mu)
return moment
def logp(value, mu, cov):
@@ -303,7 +303,7 @@ class MvStudentTRV(RandomVariable):
_print_name = ("MvStudentT", "\\operatorname{MvStudentT}")
def make_node(self, rng, size, dtype, nu, mu, cov):
- nu = at.as_tensor_variable(nu)
+ nu = pt.as_tensor_variable(nu)
if not nu.ndim == 0:
raise ValueError("nu must be a scalar (ndim=0).")
@@ -395,19 +395,19 @@ def dist(cls, nu, Sigma=None, mu=None, scale=None, tau=None, chol=None, lower=Tr
if scale is not None:
raise ValueError("Specify only one of scale and Sigma")
scale = Sigma
- nu = at.as_tensor_variable(floatX(nu))
- mu = at.as_tensor_variable(floatX(mu))
+ nu = pt.as_tensor_variable(floatX(nu))
+ mu = pt.as_tensor_variable(floatX(mu))
scale = quaddist_matrix(scale, chol, tau, lower)
         # PyTensor is stricter about the shape of mu than PyMC used to be
- mu = at.broadcast_arrays(mu, scale[..., -1])[0]
+ mu = pt.broadcast_arrays(mu, scale[..., -1])[0]
return super().dist([nu, mu, scale], **kwargs)
def moment(rv, size, nu, mu, scale):
moment = mu
if not rv_size_is_none(size):
- moment_size = at.concatenate([size, [mu.shape[-1]]])
- moment = at.full(moment_size, moment)
+ moment_size = pt.concatenate([size, [mu.shape[-1]]])
+ moment = pt.full(moment_size, moment)
return moment
def logp(value, nu, mu, scale):
@@ -427,8 +427,8 @@ def logp(value, nu, mu, scale):
quaddist, logdet, ok = quaddist_parse(value, mu, scale)
k = floatX(value.shape[-1])
- norm = gammaln((nu + k) / 2.0) - gammaln(nu / 2.0) - 0.5 * k * at.log(nu * np.pi)
- inner = -(nu + k) / 2.0 * at.log1p(quaddist / nu)
+ norm = gammaln((nu + k) / 2.0) - gammaln(nu / 2.0) - 0.5 * k * pt.log(nu * np.pi)
+ inner = -(nu + k) / 2.0 * pt.log1p(quaddist / nu)
res = norm + inner - logdet
return check_parameters(res, ok, nu > 0, msg="posdef, nu > 0")
@@ -462,17 +462,17 @@ class Dirichlet(SimplexContinuous):
@classmethod
def dist(cls, a, **kwargs):
- a = at.as_tensor_variable(a)
- # mean = a / at.sum(a)
- # mode = at.switch(at.all(a > 1), (a - 1) / at.sum(a - 1), np.nan)
+ a = pt.as_tensor_variable(a)
+ # mean = a / pt.sum(a)
+ # mode = pt.switch(pt.all(a > 1), (a - 1) / pt.sum(a - 1), np.nan)
return super().dist([a], **kwargs)
def moment(rv, size, a):
- norm_constant = at.sum(a, axis=-1)[..., None]
+ norm_constant = pt.sum(a, axis=-1)[..., None]
moment = a / norm_constant
if not rv_size_is_none(size):
- moment = at.full(at.concatenate([size, [a.shape[-1]]]), moment)
+ moment = pt.full(pt.concatenate([size, [a.shape[-1]]]), moment)
return moment
def logp(value, a):
@@ -490,11 +490,11 @@ def logp(value, a):
TensorVariable
"""
# only defined for sum(value) == 1
- res = at.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(at.sum(a, axis=-1))
- res = at.switch(
- at.or_(
- at.any(at.lt(value, 0), axis=-1),
- at.any(at.gt(value, 1), axis=-1),
+ res = pt.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(pt.sum(a, axis=-1))
+ res = pt.switch(
+ pt.or_(
+ pt.any(pt.lt(value, 0), axis=-1),
+ pt.any(pt.gt(value, 1), axis=-1),
),
-np.inf,
res,
@@ -541,7 +541,7 @@ class Multinomial(Discrete):
@classmethod
def dist(cls, n, p, *args, **kwargs):
- p = at.as_tensor_variable(p)
+ p = pt.as_tensor_variable(p)
if isinstance(p, TensorConstant):
p_ = np.asarray(p.data)
if np.any(p_ < 0):
@@ -554,21 +554,21 @@ def dist(cls, n, p, *args, **kwargs):
"You can rescale them directly to get rid of this warning.",
UserWarning,
)
- p_ = p_ / at.sum(p_, axis=-1, keepdims=True)
- p = at.as_tensor_variable(p_)
- n = at.as_tensor_variable(n)
- p = at.as_tensor_variable(p)
+ p_ = p_ / pt.sum(p_, axis=-1, keepdims=True)
+ p = pt.as_tensor_variable(p_)
+ n = pt.as_tensor_variable(n)
+ p = pt.as_tensor_variable(p)
return super().dist([n, p], *args, **kwargs)
def moment(rv, size, n, p):
- n = at.shape_padright(n)
- mode = at.round(n * p)
- diff = n - at.sum(mode, axis=-1, keepdims=True)
- inc_bool_arr = at.abs(diff) > 0
- mode = at.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()])
+ n = pt.shape_padright(n)
+ mode = pt.round(n * p)
+ diff = n - pt.sum(mode, axis=-1, keepdims=True)
+ inc_bool_arr = pt.abs(diff) > 0
+ mode = pt.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()])
if not rv_size_is_none(size):
- output_size = at.concatenate([size, [p.shape[-1]]])
- mode = at.full(output_size, mode)
+ output_size = pt.concatenate([size, [p.shape[-1]]])
+ mode = pt.full(output_size, mode)
return mode
def logp(value, n, p):
@@ -586,9 +586,9 @@ def logp(value, n, p):
TensorVariable
"""
- res = factln(n) + at.sum(-factln(value) + logpow(p, value), axis=-1)
- res = at.switch(
- at.or_(at.any(at.lt(value, 0), axis=-1), at.neq(at.sum(value, axis=-1), n)),
+ res = factln(n) + pt.sum(-factln(value) + logpow(p, value), axis=-1)
+ res = pt.switch(
+ pt.or_(pt.any(pt.lt(value, 0), axis=-1), pt.neq(pt.sum(value, axis=-1), n)),
-np.inf,
res,
)
@@ -596,8 +596,8 @@ def logp(value, n, p):
res,
0 <= p,
p <= 1,
- at.isclose(at.sum(p, axis=-1), 1),
- at.ge(n, 0),
+ pt.isclose(pt.sum(p, axis=-1), 1),
+ pt.ge(n, 0),
msg="0 <= p <= 1, sum(p) = 1, n >= 0",
)
@@ -681,7 +681,7 @@ def dist(cls, n, a, *args, **kwargs):
return super().dist([n, a], **kwargs)
def moment(rv, size, n, a):
- p = a / at.sum(a, axis=-1, keepdims=True)
+ p = a / pt.sum(a, axis=-1, keepdims=True)
return moment(Multinomial.dist(n=n, p=p, size=size))
def logp(value, n, a):
@@ -703,10 +703,10 @@ def logp(value, n, a):
series = gammaln(value + a) - (gammaln(value + 1) + gammaln(a))
res = const + series.sum(axis=-1)
- res = at.switch(
- at.or_(
- at.any(at.lt(value, 0), axis=-1),
- at.neq(at.sum(value, axis=-1), n),
+ res = pt.switch(
+ pt.or_(
+ pt.any(pt.lt(value, 0), axis=-1),
+ pt.neq(pt.sum(value, axis=-1), n),
),
-np.inf,
res,
@@ -729,16 +729,16 @@ class _OrderedMultinomial(Multinomial):
@classmethod
def dist(cls, eta, cutpoints, n, *args, **kwargs):
- eta = at.as_tensor_variable(floatX(eta))
- cutpoints = at.as_tensor_variable(cutpoints)
- n = at.as_tensor_variable(intX(n))
+ eta = pt.as_tensor_variable(floatX(eta))
+ cutpoints = pt.as_tensor_variable(cutpoints)
+ n = pt.as_tensor_variable(intX(n))
- pa = sigmoid(cutpoints - at.shape_padright(eta))
- p_cum = at.concatenate(
+ pa = sigmoid(cutpoints - pt.shape_padright(eta))
+ p_cum = pt.concatenate(
[
- at.zeros_like(at.shape_padright(pa[..., 0])),
+ pt.zeros_like(pt.shape_padright(pa[..., 0])),
pa,
- at.ones_like(at.shape_padright(pa[..., 0])),
+ pt.ones_like(pt.shape_padright(pa[..., 0])),
],
axis=-1,
)
@@ -855,7 +855,7 @@ class PosDefMatrix(Op):
# Compulsory if itypes and otypes are not defined
def make_node(self, x):
- x = at.as_tensor_variable(x)
+ x = pt.as_tensor_variable(x)
assert x.ndim == 2
o = TensorType(dtype="int8", shape=[])()
return Apply(self, [x], [o])
@@ -948,8 +948,8 @@ class Wishart(Continuous):
@classmethod
def dist(cls, nu, V, *args, **kwargs):
- nu = at.as_tensor_variable(intX(nu))
- V = at.as_tensor_variable(floatX(V))
+ nu = pt.as_tensor_variable(intX(nu))
+ V = pt.as_tensor_variable(floatX(V))
warnings.warn(
"The Wishart distribution can currently not be used "
@@ -963,7 +963,7 @@ def dist(cls, nu, V, *args, **kwargs):
# mean = nu * V
# p = V.shape[0]
- # mode = at.switch(at.ge(nu, p + 1), (nu - p - 1) * V, np.nan)
+ # mode = pt.switch(pt.ge(nu, p + 1), (nu - p - 1) * V, np.nan)
return super().dist([nu, V], *args, **kwargs)
def logp(X, nu, V):
@@ -988,15 +988,15 @@ def logp(X, nu, V):
return check_parameters(
(
- (nu - p - 1) * at.log(IXI)
+ (nu - p - 1) * pt.log(IXI)
- trace(matrix_inverse(V).dot(X))
- - nu * p * at.log(2)
- - nu * at.log(IVI)
+ - nu * p * pt.log(2)
+ - nu * pt.log(IVI)
- 2 * multigammaln(nu / 2.0, p)
)
/ 2,
matrix_pos_def(X),
- at.eq(X, X.T),
+ pt.eq(X, X.T),
nu > (p - 1),
)
@@ -1065,22 +1065,22 @@ def WishartBartlett(name, S, nu, is_cholesky=False, return_cholesky=False, initv
diag_testval = None
tril_testval = None
- c = at.sqrt(
+ c = pt.sqrt(
ChiSquared("%s_c" % name, nu - np.arange(2, 2 + n_diag), shape=n_diag, initval=diag_testval)
)
pm._log.info("Added new variable %s_c to model diagonal of Wishart." % name)
z = Normal("%s_z" % name, 0.0, 1.0, shape=n_tril, initval=tril_testval)
pm._log.info("Added new variable %s_z to model off-diagonals of Wishart." % name)
# Construct A matrix
- A = at.zeros(S.shape, dtype=np.float32)
- A = at.set_subtensor(A[diag_idx], c)
- A = at.set_subtensor(A[tril_idx], z)
+ A = pt.zeros(S.shape, dtype=np.float32)
+ A = pt.set_subtensor(A[diag_idx], c)
+ A = pt.set_subtensor(A[tril_idx], z)
# L * A * A.T * L.T ~ Wishart(L*L.T, nu)
if return_cholesky:
- return pm.Deterministic(name, at.dot(L, A))
+ return pm.Deterministic(name, pt.dot(L, A))
else:
- return pm.Deterministic(name, at.dot(at.dot(at.dot(L, A), A.T), L.T))
+ return pm.Deterministic(name, pt.dot(pt.dot(pt.dot(L, A), A.T), L.T))
def _lkj_normalizing_constant(eta, n):
@@ -1090,24 +1090,24 @@ def _lkj_normalizing_constant(eta, n):
if not isinstance(n, int):
raise NotImplementedError("n must be an integer")
if eta == 1:
- result = gammaln(2.0 * at.arange(1, int((n - 1) / 2) + 1)).sum()
+ result = gammaln(2.0 * pt.arange(1, int((n - 1) / 2) + 1)).sum()
if n % 2 == 1:
result += (
- 0.25 * (n**2 - 1) * at.log(np.pi)
- - 0.25 * (n - 1) ** 2 * at.log(2.0)
+ 0.25 * (n**2 - 1) * pt.log(np.pi)
+ - 0.25 * (n - 1) ** 2 * pt.log(2.0)
- (n - 1) * gammaln(int((n + 1) / 2))
)
else:
result += (
- 0.25 * n * (n - 2) * at.log(np.pi)
- + 0.25 * (3 * n**2 - 4 * n) * at.log(2.0)
+ 0.25 * n * (n - 2) * pt.log(np.pi)
+ + 0.25 * (3 * n**2 - 4 * n) * pt.log(2.0)
+ n * gammaln(n / 2)
- (n - 1) * gammaln(n)
)
else:
result = -(n - 1) * gammaln(eta + 0.5 * (n - 1))
- k = at.arange(1, n)
- result += (0.5 * k * at.log(np.pi) + gammaln(eta + 0.5 * (n - 1 - k))).sum()
+ k = pt.arange(1, n)
+ result += (0.5 * k * pt.log(np.pi) + gammaln(eta + 0.5 * (n - 1 - k))).sum()
return result
@@ -1119,15 +1119,15 @@ class _LKJCholeskyCovBaseRV(RandomVariable):
_print_name = ("_lkjcholeskycovbase", "\\operatorname{_lkjcholeskycovbase}")
def make_node(self, rng, size, dtype, n, eta, D):
- n = at.as_tensor_variable(n)
+ n = pt.as_tensor_variable(n)
if not n.ndim == 0:
raise ValueError("n must be a scalar (ndim=0).")
- eta = at.as_tensor_variable(eta)
+ eta = pt.as_tensor_variable(eta)
if not eta.ndim == 0:
raise ValueError("eta must be a scalar (ndim=0).")
- D = at.as_tensor_variable(D)
+ D = pt.as_tensor_variable(D)
return super().make_node(rng, size, dtype, n, eta, D)
@@ -1179,8 +1179,8 @@ class _LKJCholeskyCov(Distribution):
@classmethod
def dist(cls, n, eta, sd_dist, **kwargs):
- n = at.as_tensor_variable(intX(n))
- eta = at.as_tensor_variable(floatX(eta))
+ n = pt.as_tensor_variable(intX(n))
+ eta = pt.as_tensor_variable(floatX(eta))
if not (
isinstance(sd_dist, Variable)
@@ -1239,9 +1239,9 @@ def change_LKJCholeksyCovRV_size(op, dist, new_size, expand=False):
@_moment.register(_LKJCholeskyCovRV)
def _LKJCholeksyCovRV_moment(op, rv, rng, n, eta, sd_dist):
- diag_idxs = (at.cumsum(at.arange(1, n + 1)) - 1).astype("int32")
- moment = at.zeros_like(rv)
- moment = at.set_subtensor(moment[..., diag_idxs], 1)
+ diag_idxs = (pt.cumsum(pt.arange(1, n + 1)) - 1).astype("int32")
+ moment = pt.zeros_like(rv)
+ moment = pt.set_subtensor(moment[..., diag_idxs], 1)
return moment
@@ -1258,23 +1258,23 @@ def _LKJCholeksyCovRV_logp(op, values, rng, n, eta, sd_dist, **kwargs):
if value.ndim > 1:
raise ValueError("_LKJCholeskyCov logp is only implemented for vector values (ndim=1)")
- diag_idxs = at.cumsum(at.arange(1, n + 1)) - 1
- cumsum = at.cumsum(value**2)
- variance = at.zeros(at.atleast_1d(n))
- variance = at.inc_subtensor(variance[0], value[0] ** 2)
- variance = at.inc_subtensor(variance[1:], cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
- sd_vals = at.sqrt(variance)
+ diag_idxs = pt.cumsum(pt.arange(1, n + 1)) - 1
+ cumsum = pt.cumsum(value**2)
+ variance = pt.zeros(pt.atleast_1d(n))
+ variance = pt.inc_subtensor(variance[0], value[0] ** 2)
+ variance = pt.inc_subtensor(variance[1:], cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
+ sd_vals = pt.sqrt(variance)
logp_sd = pm.logp(sd_dist, sd_vals).sum()
corr_diag = value[diag_idxs] / sd_vals
- logp_lkj = (2 * eta - 3 + n - at.arange(n)) * at.log(corr_diag)
- logp_lkj = at.sum(logp_lkj)
+ logp_lkj = (2 * eta - 3 + n - pt.arange(n)) * pt.log(corr_diag)
+ logp_lkj = pt.sum(logp_lkj)
# Compute the log det jacobian of the second transformation
# described in the docstring.
- idx = at.arange(n)
- det_invjac = at.log(corr_diag) - idx * at.log(sd_vals)
+ idx = pt.arange(n)
+ det_invjac = pt.log(corr_diag) - idx * pt.log(sd_vals)
det_invjac = det_invjac.sum()
# TODO: _lkj_normalizing_constant currently requires `eta` and `n` to be constants
@@ -1377,10 +1377,10 @@ class LKJCholeskyCov:
# Or transform an uncorrelated normal:
vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=10)
- vals = at.dot(chol, vals_raw)
+ vals = pt.dot(chol, vals_raw)
# Or compute the covariance matrix
- cov = at.dot(chol, chol.T)
+ cov = pt.dot(chol, chol.T)
**Implementation** In the unconstrained space all values of the cholesky factor
are stored untransformed, except for the diagonal entries, where
@@ -1457,9 +1457,9 @@ def dist(cls, eta, n, sd_dist, *, compute_corr=True, **kwargs):
def helper_deterministics(cls, n, packed_chol):
chol = pm.expand_packed_triangular(n, packed_chol, lower=True)
# compute covariance matrix
- cov = at.dot(chol, chol.T)
+ cov = pt.dot(chol, chol.T)
# extract standard deviations and rho
- stds = at.sqrt(at.diag(cov))
+ stds = pt.sqrt(pt.diag(cov))
inv_stds = 1 / stds
corr = inv_stds[None, :] * cov * inv_stds[:, None]
return chol, corr, stds
@@ -1473,11 +1473,11 @@ class LKJCorrRV(RandomVariable):
_print_name = ("LKJCorrRV", "\\operatorname{LKJCorrRV}")
def make_node(self, rng, size, dtype, n, eta):
- n = at.as_tensor_variable(n)
+ n = pt.as_tensor_variable(n)
if not n.ndim == 0:
raise ValueError("n must be a scalar (ndim=0).")
- eta = at.as_tensor_variable(eta)
+ eta = pt.as_tensor_variable(eta)
if not eta.ndim == 0:
raise ValueError("eta must be a scalar (ndim=0).")
@@ -1577,12 +1577,12 @@ class LKJCorr(BoundedContinuous):
@classmethod
def dist(cls, n, eta, **kwargs):
- n = at.as_tensor_variable(intX(n))
- eta = at.as_tensor_variable(floatX(eta))
+ n = pt.as_tensor_variable(intX(n))
+ eta = pt.as_tensor_variable(floatX(eta))
return super().dist([n, eta], **kwargs)
def moment(rv, *args):
- return at.zeros_like(rv)
+ return pt.zeros_like(rv)
def logp(value, n, eta):
"""
@@ -1610,15 +1610,15 @@ def logp(value, n, eta):
tri_index[np.triu_indices(n, k=1)] = np.arange(shape)
tri_index[np.triu_indices(n, k=1)[::-1]] = np.arange(shape)
- value = at.take(value, tri_index)
- value = at.fill_diagonal(value, 1)
+ value = pt.take(value, tri_index)
+ value = pt.fill_diagonal(value, 1)
# TODO: _lkj_normalizing_constant currently requires `eta` and `n` to be constants
if not isinstance(eta, Constant):
raise NotImplementedError("logp only implemented for constant `eta`")
eta = float(eta.data)
result = _lkj_normalizing_constant(eta, n)
- result += (eta - 1.0) * at.log(det(value))
+ result += (eta - 1.0) * pt.log(det(value))
return check_parameters(
result,
value >= -1,
@@ -1651,7 +1651,9 @@ def rng_fn(cls, rng, mu, rowchol, colchol, size=None):
output_shape = size + dist_shape
# Broadcasting all parameters
- (mu,) = broadcast_dist_samples_to(to_shape=output_shape, samples=[mu], size=size)
+ shapes = [mu.shape, output_shape]
+ broadcastable_shape = broadcast_dist_samples_shape(shapes, size=size)
+ mu = np.broadcast_to(mu, shape=broadcastable_shape)
rowchol = np.broadcast_to(rowchol, shape=size + rowchol.shape[-2:])
colchol = np.broadcast_to(colchol, shape=size + colchol.shape[-2:])
@@ -1749,7 +1751,7 @@ class MatrixNormal(Continuous):
# Setup left covariance matrix
scale = pm.LogNormal('scale', mu=np.log(true_scale), sigma=0.5)
- rowcov = at.diag([scale**(2*i) for i in range(m)])
+ rowcov = pt.diag([scale**(2*i) for i in range(m)])
vals = pm.MatrixNormal('vals', mu=mu, colchol=colchol, rowcov=rowcov,
observed=data)
@@ -1781,7 +1783,7 @@ def dist(
else:
if rowchol.ndim != 2:
raise ValueError("rowchol must be two dimensional.")
- rowchol_cov = at.as_tensor_variable(rowchol)
+ rowchol_cov = pt.as_tensor_variable(rowchol)
# Among-column matrices
if len([i for i in [colcov, colchol] if i is not None]) != 1:
@@ -1789,25 +1791,25 @@ def dist(
"Incompatible parameterization. Specify exactly one of colcov, or colchol."
)
if colcov is not None:
- colcov = at.as_tensor_variable(colcov)
+ colcov = pt.as_tensor_variable(colcov)
if colcov.ndim != 2:
raise ValueError("colcov must be two dimensional.")
colchol_cov = cholesky(colcov)
else:
if colchol.ndim != 2:
raise ValueError("colchol must be two dimensional.")
- colchol_cov = at.as_tensor_variable(colchol)
+ colchol_cov = pt.as_tensor_variable(colchol)
dist_shape = (rowchol_cov.shape[-1], colchol_cov.shape[-1])
# Broadcasting mu
- mu = at.extra_ops.broadcast_to(mu, shape=dist_shape)
- mu = at.as_tensor_variable(floatX(mu))
+ mu = pt.extra_ops.broadcast_to(mu, shape=dist_shape)
+ mu = pt.as_tensor_variable(floatX(mu))
return super().dist([mu, rowchol_cov, colchol_cov], **kwargs)
def moment(rv, size, mu, rowchol, colchol):
- return at.full_like(rv, mu)
+ return pt.full_like(rv, mu)
def logp(value, mu, rowchol, colchol):
"""
@@ -1833,15 +1835,15 @@ def logp(value, mu, rowchol, colchol):
# Find exponent piece by piece
right_quaddist = solve_lower(rowchol, delta)
- quaddist = at.nlinalg.matrix_dot(right_quaddist.T, right_quaddist)
+ quaddist = pt.nlinalg.matrix_dot(right_quaddist.T, right_quaddist)
quaddist = solve_lower(colchol, quaddist)
quaddist = solve_upper(colchol.T, quaddist)
- trquaddist = at.nlinalg.trace(quaddist)
+ trquaddist = pt.nlinalg.trace(quaddist)
- coldiag = at.diag(colchol)
- rowdiag = at.diag(rowchol)
- half_collogdet = at.sum(at.log(coldiag)) # logdet(M) = 2*Tr(log(L))
- half_rowlogdet = at.sum(at.log(rowdiag)) # Using Cholesky: M = L L^T
+ coldiag = pt.diag(colchol)
+ rowdiag = pt.diag(rowchol)
+ half_collogdet = pt.sum(pt.log(coldiag)) # logdet(M) = 2*Tr(log(L))
+ half_rowlogdet = pt.sum(pt.log(rowdiag)) # Using Cholesky: M = L L^T
m = rowchol.shape[0]
n = colchol.shape[0]
@@ -1904,7 +1906,7 @@ class KroneckerNormal(Continuous):
:math:`[(v_1, Q_1), (v_2, Q_2), ...]` such that
:math:`K_i = Q_i \text{diag}(v_i) Q_i'`. For example::
- v_i, Q_i = at.nlinalg.eigh(K_i)
+ v_i, Q_i = pt.nlinalg.eigh(K_i)
sigma : scalar, optional
Standard deviation of the Gaussian white noise.
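The ``evds`` parameterization documented above rests on the eigendecomposition identity; a quick NumPy check with an illustrative matrix::

    import numpy as np

    K = np.array([[2.0, 0.5], [0.5, 1.0]])
    v, Q = np.linalg.eigh(K)
    assert np.allclose(Q @ np.diag(v) @ Q.T, K)  # K = Q diag(v) Q'
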
@@ -1978,18 +1980,18 @@ def dist(cls, mu, covs=None, chols=None, evds=None, sigma=None, *args, **kwargs)
covs = []
eigs_sep, Qs = zip(*eigh_iterable) # Unzip
for eig, Q in zip(eigs_sep, Qs):
- cov_i = at.dot(Q, at.dot(at.diag(eig), Q.T))
+ cov_i = pt.dot(Q, pt.dot(pt.diag(eig), Q.T))
covs.append(cov_i)
- mu = at.as_tensor_variable(mu)
+ mu = pt.as_tensor_variable(mu)
return super().dist([mu, sigma, *covs], **kwargs)
def moment(rv, size, mu, covs, chols, evds):
mean = mu
if not rv_size_is_none(size):
- moment_size = at.concatenate([size, mu.shape])
- mean = at.full(moment_size, mu)
+ moment_size = pt.concatenate([size, mu.shape])
+ mean = pt.full(moment_size, mu)
return mean
def logp(value, mu, sigma, *covs):
@@ -2019,24 +2021,24 @@ def logp(value, mu, sigma, *covs):
eigh_iterable = map(eigh, covs)
eigs_sep, Qs = zip(*eigh_iterable) # Unzip
- Qs = list(map(at.as_tensor_variable, Qs))
- QTs = list(map(at.transpose, Qs))
+ Qs = list(map(pt.as_tensor_variable, Qs))
+ QTs = list(map(pt.transpose, Qs))
- eigs_sep = list(map(at.as_tensor_variable, eigs_sep))
+ eigs_sep = list(map(pt.as_tensor_variable, eigs_sep))
eigs = kron_diag(*eigs_sep) # Combine separate eigs
eigs += sigma**2
N = eigs.shape[0]
sqrt_quad = kron_dot(QTs, delta.T)
- sqrt_quad = sqrt_quad / at.sqrt(eigs[:, None])
- logdet = at.sum(at.log(eigs))
+ sqrt_quad = sqrt_quad / pt.sqrt(eigs[:, None])
+ logdet = pt.sum(pt.log(eigs))
# Square each sample
- quad = at.batched_dot(sqrt_quad.T, sqrt_quad.T)
+ quad = pt.batched_dot(sqrt_quad.T, sqrt_quad.T)
if onedim:
quad = quad[0]
- a = -(quad + logdet + N * at.log(2 * np.pi)) / 2.0
+ a = -(quad + logdet + N * pt.log(2 * np.pi)) / 2.0
return a
@@ -2048,7 +2050,7 @@ class CARRV(RandomVariable):
_print_name = ("CAR", "\\operatorname{CAR}")
def make_node(self, rng, size, dtype, mu, W, alpha, tau):
- mu = at.as_tensor_variable(floatX(mu))
+ mu = pt.as_tensor_variable(floatX(mu))
W = pytensor.sparse.as_sparse_or_tensor_variable(floatX(W))
if not W.ndim == 2:
@@ -2057,13 +2059,13 @@ def make_node(self, rng, size, dtype, mu, W, alpha, tau):
sparse = isinstance(W, pytensor.sparse.SparseVariable)
msg = "W must be a symmetric adjacency matrix."
if sparse:
- abs_diff = pytensor.sparse.basic.mul(pytensor.sparse.basic.sgn(W - W.T), W - W.T)
- W = Assert(msg)(W, at.isclose(pytensor.sparse.basic.sp_sum(abs_diff), 0))
+ abs_diff = pytensor.sparse.basic.mul(pytensor.sparse.sign(W - W.T), W - W.T)
+ W = Assert(msg)(W, pt.isclose(pytensor.sparse.sp_sum(abs_diff), 0))
else:
- W = Assert(msg)(W, at.allclose(W, W.T))
+ W = Assert(msg)(W, pt.allclose(W, W.T))
- tau = at.as_tensor_variable(floatX(tau))
- alpha = at.as_tensor_variable(floatX(alpha))
+ tau = pt.as_tensor_variable(floatX(tau))
+ alpha = pt.as_tensor_variable(floatX(alpha))
return super().make_node(rng, size, dtype, mu, W, alpha, tau)
@@ -2164,7 +2166,7 @@ def dist(cls, mu, W, alpha, tau, *args, **kwargs):
return super().dist([mu, W, alpha, tau], **kwargs)
def moment(rv, size, mu, W, alpha, tau):
- return at.full_like(rv, mu)
+ return pt.full_like(rv, mu)
def logp(value, mu, W, alpha, tau):
"""
@@ -2187,27 +2189,27 @@ def logp(value, mu, W, alpha, tau):
if sparse:
D = sp_sum(W, axis=0)
- Dinv_sqrt = at.diag(1 / at.sqrt(D))
- DWD = at.dot(pytensor.sparse.dot(Dinv_sqrt, W), Dinv_sqrt)
+ Dinv_sqrt = pt.diag(1 / pt.sqrt(D))
+ DWD = pt.dot(pytensor.sparse.dot(Dinv_sqrt, W), Dinv_sqrt)
else:
D = W.sum(axis=0)
- Dinv_sqrt = at.diag(1 / at.sqrt(D))
- DWD = at.dot(at.dot(Dinv_sqrt, W), Dinv_sqrt)
- lam = at.slinalg.eigvalsh(DWD, at.eye(DWD.shape[0]))
+ Dinv_sqrt = pt.diag(1 / pt.sqrt(D))
+ DWD = pt.dot(pt.dot(Dinv_sqrt, W), Dinv_sqrt)
+ lam = pt.slinalg.eigvalsh(DWD, pt.eye(DWD.shape[0]))
d, _ = W.shape
if value.ndim == 1:
value = value[None, :]
- logtau = d * at.log(tau).sum()
- logdet = at.log(1 - alpha.T * lam[:, None]).sum()
+ logtau = d * pt.log(tau).sum()
+ logdet = pt.log(1 - alpha.T * lam[:, None]).sum()
delta = value - mu
if sparse:
Wdelta = pytensor.sparse.dot(delta, W)
else:
- Wdelta = at.dot(delta, W)
+ Wdelta = pt.dot(delta, W)
tau_dot_delta = D[None, :] * delta - alpha * Wdelta
logquad = (tau * delta * tau_dot_delta).sum(axis=-1)
@@ -2228,8 +2230,8 @@ class StickBreakingWeightsRV(RandomVariable):
_print_name = ("StickBreakingWeights", "\\operatorname{StickBreakingWeights}")
def make_node(self, rng, size, dtype, alpha, K):
- alpha = at.as_tensor_variable(alpha)
- K = at.as_tensor_variable(intX(K))
+ alpha = pt.as_tensor_variable(alpha)
+ K = pt.as_tensor_variable(intX(K))
if K.ndim > 0:
raise ValueError("K must be a scalar.")
@@ -2310,18 +2312,18 @@ class StickBreakingWeights(SimplexContinuous):
@classmethod
def dist(cls, alpha, K, *args, **kwargs):
- alpha = at.as_tensor_variable(floatX(alpha))
- K = at.as_tensor_variable(intX(K))
+ alpha = pt.as_tensor_variable(floatX(alpha))
+ K = pt.as_tensor_variable(intX(K))
return super().dist([alpha, K], **kwargs)
def moment(rv, size, alpha, K):
alpha = alpha[..., np.newaxis]
- moment = (alpha / (1 + alpha)) ** at.arange(K)
+ moment = (alpha / (1 + alpha)) ** pt.arange(K)
moment *= 1 / (1 + alpha)
- moment = at.concatenate([moment, (alpha / (1 + alpha)) ** K], axis=-1)
+ moment = pt.concatenate([moment, (alpha / (1 + alpha)) ** K], axis=-1)
if not rv_size_is_none(size):
- moment_size = at.concatenate(
+ moment_size = pt.concatenate(
[
size,
[
@@ -2329,7 +2331,7 @@ def moment(rv, size, alpha, K):
],
]
)
- moment = at.full(moment_size, moment)
+ moment = pt.full(moment_size, moment)
return moment
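This moment corresponds to the expected weights of a Beta(1, alpha) stick-breaking construction; a small NumPy check (values illustrative) that the resulting vector lies on the simplex::

    import numpy as np

    alpha, K = 2.5, 19
    r = alpha / (1 + alpha)
    moment = np.append(r ** np.arange(K) / (1 + alpha), r**K)

    assert moment.shape == (K + 1,)
    assert np.isclose(moment.sum(), 1.0)
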
@@ -2347,9 +2349,9 @@ def logp(value, alpha, K):
-------
TensorVariable
"""
- logp = -at.sum(
- at.log(
- at.cumsum(
+ logp = -pt.sum(
+ pt.log(
+ pt.cumsum(
value[..., ::-1],
axis=-1,
)
@@ -2357,17 +2359,17 @@ def logp(value, alpha, K):
axis=-1,
)
logp += -K * betaln(1, alpha)
- logp += alpha * at.log(value[..., -1])
+ logp += alpha * pt.log(value[..., -1])
- logp = at.switch(
- at.or_(
- at.any(
- at.and_(at.le(value, 0), at.ge(value, 1)),
+ logp = pt.switch(
+ pt.or_(
+ pt.any(
+ pt.and_(pt.le(value, 0), pt.ge(value, 1)),
axis=-1,
),
- at.or_(
- at.bitwise_not(at.allclose(value.sum(-1), 1)),
- at.neq(value.shape[-1], K + 1),
+ pt.or_(
+ pt.bitwise_not(pt.allclose(value.sum(-1), 1)),
+ pt.neq(value.shape[-1], K + 1),
),
),
-np.inf,
@@ -2486,7 +2488,7 @@ def __new__(
def dist(cls, sigma=1, n_zerosum_axes=None, support_shape=None, **kwargs):
n_zerosum_axes = cls.check_zerosum_axes(n_zerosum_axes)
- sigma = at.as_tensor_variable(floatX(sigma))
+ sigma = pt.as_tensor_variable(floatX(sigma))
if sigma.ndim > 0:
raise ValueError("sigma has to be a scalar")
@@ -2499,12 +2501,12 @@ def dist(cls, sigma=1, n_zerosum_axes=None, support_shape=None, **kwargs):
if support_shape is None:
if n_zerosum_axes > 0:
raise ValueError("You must specify dims, shape or support_shape parameter")
- # TODO: edge-case doesn't work for now, because at.stack in get_support_shape fails
+ # TODO: edge-case doesn't work for now, because pt.stack in get_support_shape fails
# else:
# support_shape = () # because it's just a Normal in that case
- support_shape = at.as_tensor_variable(intX(support_shape))
+ support_shape = pt.as_tensor_variable(intX(support_shape))
- assert n_zerosum_axes == at.get_vector_length(
+ assert n_zerosum_axes == pt.get_vector_length(
support_shape
), "support_shape has to be as long as n_zerosum_axes"
@@ -2564,7 +2566,7 @@ def change_zerosum_size(op, normal_dist, new_size, expand=False):
@_moment.register(ZeroSumNormalRV)
def zerosumnormal_moment(op, rv, *rv_inputs):
- return at.zeros_like(rv)
+ return pt.zeros_like(rv)
@_default_transform.register(ZeroSumNormalRV)
@@ -2579,16 +2581,16 @@ def zerosumnormal_logp(op, values, normal_dist, sigma, support_shape, **kwargs):
shape = value.shape
n_zerosum_axes = op.ndim_supp
- _deg_free_support_shape = at.inc_subtensor(shape[-n_zerosum_axes:], -1)
- _full_size = at.prod(shape)
- _degrees_of_freedom = at.prod(_deg_free_support_shape)
+ _deg_free_support_shape = pt.inc_subtensor(shape[-n_zerosum_axes:], -1)
+ _full_size = pt.prod(shape)
+ _degrees_of_freedom = pt.prod(_deg_free_support_shape)
zerosums = [
- at.all(at.isclose(at.mean(value, axis=-axis - 1), 0, atol=1e-9))
+ pt.all(pt.isclose(pt.mean(value, axis=-axis - 1), 0, atol=1e-9))
for axis in range(n_zerosum_axes)
]
- out = at.sum(
+ out = pt.sum(
pm.logp(normal_dist, value) * _degrees_of_freedom / _full_size,
axis=tuple(np.arange(-n_zerosum_axes, 0)),
)
diff --git a/pymc/distributions/shape_utils.py b/pymc/distributions/shape_utils.py
index 29fdf57a99..2987ec444c 100644
--- a/pymc/distributions/shape_utils.py
+++ b/pymc/distributions/shape_utils.py
@@ -25,7 +25,7 @@
import numpy as np
from pytensor import config
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.graph.basic import Variable
from pytensor.graph.op import Op, compute_test_value
from pytensor.raise_op import Assert
@@ -38,12 +38,8 @@
from pymc.pytensorf import convert_observed_data
__all__ = [
- "to_tuple",
- "shapes_broadcasting",
"broadcast_dist_samples_shape",
- "get_broadcastable_dist_samples",
- "broadcast_distribution_samples",
- "broadcast_dist_samples_to",
+ "to_tuple",
"rv_size_is_none",
"change_dist_size",
]
@@ -91,47 +87,6 @@ def _check_shape_type(shape):
return tuple(out)
-def shapes_broadcasting(*args, raise_exception=False):
- """Return the shape resulting from broadcasting multiple shapes.
- Represents numpy's broadcasting rules.
-
- Parameters
- ----------
- *args: array-like of int
- Tuples or arrays or lists representing the shapes of arrays to be
- broadcast.
- raise_exception: bool (optional)
- Controls whether to raise an exception or simply return `None` if
- the broadcasting fails.
-
- Returns
- -------
- Resulting shape. If broadcasting is not possible and `raise_exception` is
- False, then `None` is returned. If `raise_exception` is `True`, a
- `ValueError` is raised.
- """
- x = list(_check_shape_type(args[0])) if args else ()
- for arg in args[1:]:
- y = list(_check_shape_type(arg))
- if len(x) < len(y):
- x, y = y, x
- if len(y) > 0:
- x[-len(y) :] = [
- j if i == 1 else i if j == 1 else i if i == j else 0
- for i, j in zip(x[-len(y) :], y)
- ]
- if not all(x):
- if raise_exception:
- raise ValueError(
- "Supplied shapes {} do not broadcast together".format(
- ", ".join([f"{a}" for a in args])
- )
- )
- else:
- return None
- return tuple(x)
-
-
def broadcast_dist_samples_shape(shapes, size=None):
"""Apply shape broadcasting to shape tuples but assuming that the shapes
correspond to draws from random variables, with the `size` tuple possibly
@@ -152,7 +107,6 @@ def broadcast_dist_samples_shape(shapes, size=None):
Examples
--------
.. code-block:: python
-
size = 100
shape0 = (size,)
shape1 = (size, 5)
@@ -160,9 +114,7 @@ def broadcast_dist_samples_shape(shapes, size=None):
out = broadcast_dist_samples_shape([shape0, shape1, shape2],
size=size)
assert out == (size, 4, 5)
-
.. code-block:: python
-
size = 100
shape0 = (size,)
shape1 = (5,)
@@ -170,9 +122,7 @@ def broadcast_dist_samples_shape(shapes, size=None):
out = broadcast_dist_samples_shape([shape0, shape1, shape2],
size=size)
assert out == (size, 4, 5)
-
.. code-block:: python
-
size = 100
shape0 = (1,)
shape1 = (5,)
@@ -182,7 +132,7 @@ def broadcast_dist_samples_shape(shapes, size=None):
assert out == (4, 5)
"""
if size is None:
- broadcasted_shape = shapes_broadcasting(*shapes)
+ broadcasted_shape = np.broadcast_shapes(*shapes)
if broadcasted_shape is None:
raise ValueError(
"Cannot broadcast provided shapes {} given size: {}".format(
@@ -195,7 +145,7 @@ def broadcast_dist_samples_shape(shapes, size=None):
# samples shapes without the size prepend
sp_shapes = [s[len(_size) :] if _size == s[: min([len(_size), len(s)])] else s for s in shapes]
try:
- broadcast_shape = shapes_broadcasting(*sp_shapes, raise_exception=True)
+ broadcast_shape = np.broadcast_shapes(*sp_shapes)
except ValueError:
raise ValueError(
"Cannot broadcast provided shapes {} given size: {}".format(
@@ -215,212 +165,7 @@ def broadcast_dist_samples_shape(shapes, size=None):
else:
p_shape = shape
broadcastable_shapes.append(p_shape)
- return shapes_broadcasting(*broadcastable_shapes, raise_exception=True)
-
-
-def get_broadcastable_dist_samples(
- samples, size=None, must_bcast_with=None, return_out_shape=False
-):
- """Get a view of the samples drawn from distributions which adds new axes
- in between the `size` prepend and the distribution's `shape`. These views
- should be able to broadcast the samples from the distrubtions taking into
- account the `size` (i.e. the number of samples) of the draw, which is
- prepended to the sample's `shape`. Optionally, one can supply an extra
- `must_bcast_with` to try to force samples to be able to broadcast with a
- given shape. A `ValueError` is raised if it is not possible to broadcast
- the provided samples.
-
- Parameters
- ----------
- samples: Iterable of ndarrays holding the sampled values
- size: None, int or tuple (optional)
- size of the sample set requested.
- must_bcast_with: None, int or tuple (optional)
- Tuple shape to which the samples must be able to broadcast
- return_out_shape: bool (optional)
- If `True`, this function also returns the output's shape and not only
- samples views.
-
- Returns
- -------
- broadcastable_samples: List of the broadcasted sample arrays
- broadcast_shape: If `return_out_shape` is `True`, the resulting broadcast
- shape is returned.
-
- Examples
- --------
- .. code-block:: python
-
- must_bcast_with = (3, 1, 5)
- size = 100
- sample0 = np.random.randn(size)
- sample1 = np.random.randn(size, 5)
- sample2 = np.random.randn(size, 4, 5)
- out = broadcast_dist_samples_to(
- [sample0, sample1, sample2],
- size=size,
- must_bcast_with=must_bcast_with,
- )
- assert out[0].shape == (size, 1, 1, 1)
- assert out[1].shape == (size, 1, 1, 5)
- assert out[2].shape == (size, 1, 4, 5)
- assert np.all(sample0[:, None, None, None] == out[0])
- assert np.all(sample1[:, None, None] == out[1])
- assert np.all(sample2[:, None] == out[2])
-
- .. code-block:: python
-
- size = 100
- must_bcast_with = (3, 1, 5)
- sample0 = np.random.randn(size)
- sample1 = np.random.randn(5)
- sample2 = np.random.randn(4, 5)
- out = broadcast_dist_samples_to(
- [sample0, sample1, sample2],
- size=size,
- must_bcast_with=must_bcast_with,
- )
- assert out[0].shape == (size, 1, 1, 1)
- assert out[1].shape == (5,)
- assert out[2].shape == (4, 5)
- assert np.all(sample0[:, None, None, None] == out[0])
- assert np.all(sample1 == out[1])
- assert np.all(sample2 == out[2])
- """
- samples = [np.asarray(p) for p in samples]
- _size = to_tuple(size)
- must_bcast_with = to_tuple(must_bcast_with)
- # Raw samples shapes
- p_shapes = [p.shape for p in samples] + [_check_shape_type(must_bcast_with)]
- out_shape = broadcast_dist_samples_shape(p_shapes, size=size)
- # samples shapes without the size prepend
- sp_shapes = [
- s[len(_size) :] if _size == s[: min([len(_size), len(s)])] else s for s in p_shapes
- ]
- broadcast_shape = shapes_broadcasting(*sp_shapes, raise_exception=True)
- broadcastable_samples = []
- for param, p_shape, sp_shape in zip(samples, p_shapes, sp_shapes):
- if _size == p_shape[: min([len(_size), len(p_shape)])]:
- # If size prepends the shape, then we have to add broadcasting axis
- # in the middle
- slicer_head = [slice(None)] * len(_size)
- slicer_tail = [np.newaxis] * (len(broadcast_shape) - len(sp_shape)) + [
- slice(None)
- ] * len(sp_shape)
- else:
- # If size does not prepend the shape, then we have leave the
- # parameter as is
- slicer_head = []
- slicer_tail = [slice(None)] * len(sp_shape)
- broadcastable_samples.append(param[tuple(slicer_head + slicer_tail)])
- if return_out_shape:
- return broadcastable_samples, out_shape
- else:
- return broadcastable_samples
-
-
-def broadcast_distribution_samples(samples, size=None):
- """Broadcast samples drawn from distributions taking into account the
- size (i.e. the number of samples) of the draw, which is prepended to
- the sample's shape.
-
- Parameters
- ----------
- samples: Iterable of ndarrays holding the sampled values
- size: None, int or tuple (optional)
- size of the sample set requested.
-
- Returns
- -------
- List of broadcasted sample arrays
-
- Examples
- --------
- .. code-block:: python
-
- size = 100
- sample0 = np.random.randn(size)
- sample1 = np.random.randn(size, 5)
- sample2 = np.random.randn(size, 4, 5)
- out = broadcast_distribution_samples([sample0, sample1, sample2],
- size=size)
- assert all((o.shape == (size, 4, 5) for o in out))
- assert np.all(sample0[:, None, None] == out[0])
- assert np.all(sample1[:, None, :] == out[1])
- assert np.all(sample2 == out[2])
-
- .. code-block:: python
-
- size = 100
- sample0 = np.random.randn(size)
- sample1 = np.random.randn(5)
- sample2 = np.random.randn(4, 5)
- out = broadcast_distribution_samples([sample0, sample1, sample2],
- size=size)
- assert all((o.shape == (size, 4, 5) for o in out))
- assert np.all(sample0[:, None, None] == out[0])
- assert np.all(sample1 == out[1])
- assert np.all(sample2 == out[2])
- """
- return np.broadcast_arrays(*get_broadcastable_dist_samples(samples, size=size))
-
-
-def broadcast_dist_samples_to(to_shape, samples, size=None):
- """Broadcast samples drawn from distributions to a given shape, taking into
- account the size (i.e. the number of samples) of the draw, which is
- prepended to the sample's shape.
-
- Parameters
- ----------
- to_shape: Tuple shape onto which the samples must be able to broadcast
- samples: Iterable of ndarrays holding the sampled values
- size: None, int or tuple (optional)
- size of the sample set requested.
-
- Returns
- -------
- List of the broadcasted sample arrays
-
- Examples
- --------
- .. code-block:: python
-
- to_shape = (3, 1, 5)
- size = 100
- sample0 = np.random.randn(size)
- sample1 = np.random.randn(size, 5)
- sample2 = np.random.randn(size, 4, 5)
- out = broadcast_dist_samples_to(
- to_shape,
- [sample0, sample1, sample2],
- size=size
- )
- assert np.all((o.shape == (size, 3, 4, 5) for o in out))
- assert np.all(sample0[:, None, None, None] == out[0])
- assert np.all(sample1[:, None, None] == out[1])
- assert np.all(sample2[:, None] == out[2])
-
- .. code-block:: python
-
- size = 100
- to_shape = (3, 1, 5)
- sample0 = np.random.randn(size)
- sample1 = np.random.randn(5)
- sample2 = np.random.randn(4, 5)
- out = broadcast_dist_samples_to(
- to_shape,
- [sample0, sample1, sample2],
- size=size
- )
- assert np.all((o.shape == (size, 3, 4, 5) for o in out))
- assert np.all(sample0[:, None, None, None] == out[0])
- assert np.all(sample1 == out[1])
- assert np.all(sample2 == out[2])
- """
- samples, to_shape = get_broadcastable_dist_samples(
- samples, size=size, must_bcast_with=to_shape, return_out_shape=True
- )
- return [np.broadcast_to(o, to_shape) for o in samples]
+ return np.broadcast_shapes(*broadcastable_shapes)
# User-provided can be lazily specified as scalars
@@ -546,7 +291,7 @@ def find_size(
def rv_size_is_none(size: Variable) -> bool:
- """Check whether an rv size is None (ie., at.Constant([]))"""
+ """Check whether an rv size is None (ie., pt.Constant([]))"""
return size.type.shape == (0,) # type: ignore [attr-defined]
@@ -626,7 +371,7 @@ def change_rv_size(op, rv, new_size, expand) -> TensorVariable:
# Make sure the new size is a tensor. This dtype-aware conversion helps
# to not unnecessarily pick up a `Cast` in some cases (see #4652).
- new_size = at.as_tensor(new_size, ndim=1, dtype="int64")
+ new_size = pt.as_tensor(new_size, ndim=1, dtype="int64")
new_rv = rv_node.op(*dist_params, size=new_size, dtype=dtype)
@@ -662,7 +407,7 @@ def change_specify_shape_size(op, ss, new_size, expand) -> TensorVariable:
new_shapes[-ndim_supp:] = shapes[-ndim_supp:]
# specify_shape has a wrong signature https://github.com/pytensor-devs/pytensor/issues/1164
- return at.specify_shape(new_var, new_shapes) # type: ignore
+ return pt.specify_shape(new_var, new_shapes) # type: ignore
def get_support_shape(
@@ -753,13 +498,13 @@ def get_support_shape(
cast(
Variable,
Assert(msg="support_shape does not match respective shape dimension")(
- inferred, at.eq(inferred, explicit)
+ inferred, pt.eq(inferred, explicit)
),
)
for inferred, explicit in zip(inferred_support_shape, support_shape)
]
- return at.stack(inferred_support_shape)
+ return pt.stack(inferred_support_shape)
def get_support_shape_1d(
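The removed ``shapes_broadcasting`` helper is replaced throughout by ``np.broadcast_shapes``, which returns the broadcast shape or raises on incompatible inputs instead of returning ``None``; a quick sketch of the behaviour the new code relies on::

    import numpy as np

    assert np.broadcast_shapes((6, 7), (5, 6, 1), (7,)) == (5, 6, 7)

    try:
        np.broadcast_shapes((3,), (4,))
    except ValueError:
        pass  # incompatible shapes now raise rather than return None
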
diff --git a/pymc/distributions/simulator.py b/pymc/distributions/simulator.py
index 43d376226d..6c8db99933 100644
--- a/pymc/distributions/simulator.py
+++ b/pymc/distributions/simulator.py
@@ -16,7 +16,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.op import Apply, Op
from pytensor.tensor.random.op import RandomVariable
@@ -86,15 +86,6 @@ class Simulator(Distribution):
         Keyword form of ``unnamed_params``.
         One of unnamed_params or params must be provided.
         If both unnamed_params and params are passed, an error is raised.
- class_name : str
- Name for the RandomVariable class which will wrap the Simulator methods.
- When not specified, it will be given the name of the variable.
-
- .. warning:: New Simulators created with the same class_name will override the
- methods dispatched onto the previous classes. If using Simulators with
- different methods across separate models, be sure to use distinct
- class_names.
-
distance : PyTensor_Op, callable or str, default "gaussian"
Distance function. Available options are ``"gaussian"``, ``"laplace"``,
``"kullback_leibler"`` or a user defined function (or PyTensor_Op) that takes
@@ -123,6 +114,8 @@ class Simulator(Distribution):
Number of minimum dimensions of each parameter of the RV. For example,
if the Simulator accepts two scalar inputs, it should be ``[0, 0]``.
Default to list of 0 with length equal to the number of parameters.
+ class_name : str, optional
+        Name for the RandomVariable class which will wrap the Simulator methods.
+        Defaults to ``Simulator_<name>`` for model variables and to ``Simulator``
+        when the distribution is created via ``.dist()``.
Examples
--------
@@ -149,7 +142,7 @@ def simulator_fn(rng, loc, scale, size):
rv_type = SimulatorRV
def __new__(cls, name, *args, **kwargs):
- kwargs.setdefault("class_name", name)
+ kwargs.setdefault("class_name", f"Simulator_{name}")
return super().__new__(cls, name, *args, **kwargs)
@classmethod
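A usage sketch of the new default naming (the variable name, simulator function, and data below are illustrative): when ``class_name`` is not given, the wrapping RandomVariable class is expected to be named after the model variable with a ``Simulator_`` prefix.

.. code-block:: python

    import numpy as np
    import pymc as pm

    def simulator_fn(rng, loc, scale, size):
        return rng.normal(loc, scale, size=size)

    with pm.Model():
        sim = pm.Simulator("sim", simulator_fn, 0.0, 1.0, observed=np.zeros(10))

    print(type(sim.owner.op).__name__)  # expected to print "Simulator_sim"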
@@ -158,13 +151,13 @@ def dist( # type: ignore
fn,
*unnamed_params,
params=None,
- class_name: str,
distance="gaussian",
sum_stat="identity",
epsilon=1,
ndim_supp=0,
ndims_params=None,
dtype="floatX",
+ class_name: str = "Simulator",
**kwargs,
):
if not isinstance(distance, Op):
@@ -188,9 +181,9 @@ def dist( # type: ignore
if sum_stat == "identity":
sum_stat = identity
elif sum_stat == "sort":
- sum_stat = at.sort
+ sum_stat = pt.sort
elif sum_stat == "mean":
- sum_stat = at.mean
+ sum_stat = pt.mean
elif sum_stat == "median":
# Missing in PyTensor, see pytensor/issues/525
sum_stat = create_sum_stat_op_from_fn(np.median)
@@ -199,7 +192,7 @@ def dist( # type: ignore
else:
raise ValueError(f"The summary statistic {sum_stat} is not implemented")
- epsilon = at.as_tensor_variable(floatX(epsilon))
+ epsilon = pt.as_tensor_variable(floatX(epsilon))
if params is None:
params = unnamed_params
@@ -213,7 +206,6 @@ def dist( # type: ignore
return super().dist(
params,
- class_name=class_name,
fn=fn,
ndim_supp=ndim_supp,
ndims_params=ndims_params,
@@ -221,6 +213,7 @@ def dist( # type: ignore
distance=distance,
sum_stat=sum_stat,
epsilon=epsilon,
+ class_name=class_name,
**kwargs,
)
@@ -228,7 +221,6 @@ def dist( # type: ignore
def rv_op(
cls,
*params,
- class_name,
fn,
ndim_supp,
ndims_params,
@@ -236,13 +228,14 @@ def rv_op(
distance,
sum_stat,
epsilon,
+ class_name,
**kwargs,
):
sim_op = type(
- f"Simulator_{class_name}",
+ class_name,
(SimulatorRV,),
dict(
- name=f"Simulator_{class_name}",
+ name=class_name,
ndim_supp=ndim_supp,
ndims_params=ndims_params,
dtype=dtype,
@@ -260,8 +253,8 @@ def rv_op(
def simulator_moment(op, rv, *inputs):
sim_inputs = inputs[3:]
# Take the mean of 10 draws
- multiple_sim = rv.owner.op(*sim_inputs, size=at.concatenate([[10], rv.shape]))
- return at.mean(multiple_sim, axis=0)
+ multiple_sim = rv.owner.op(*sim_inputs, size=pt.concatenate([[10], rv.shape]))
+ return pt.mean(multiple_sim, axis=0)
@_logprob.register(SimulatorRV)
@@ -296,7 +289,7 @@ def gaussian(epsilon, obs_data, sim_data):
def laplace(epsilon, obs_data, sim_data):
"""Laplace kernel."""
- return -at.abs((obs_data - sim_data) / epsilon)
+ return -pt.abs((obs_data - sim_data) / epsilon)
class KullbackLeibler:
@@ -320,7 +313,7 @@ def __call__(self, epsilon, obs_data, sim_data):
def create_sum_stat_op_from_fn(fn):
- vectorX = at.dvector if pytensor.config.floatX == "float64" else at.fvector
+ vectorX = pt.dvector if pytensor.config.floatX == "float64" else pt.fvector
# Check if callable returns TensorVariable with dummy inputs
try:
@@ -333,7 +326,7 @@ def create_sum_stat_op_from_fn(fn):
# Otherwise, automatically wrap in PyTensor Op
class SumStat(Op):
def make_node(self, x):
- x = at.as_tensor_variable(x)
+ x = pt.as_tensor_variable(x)
return Apply(self, [x], [vectorX()])
def perform(self, node, inputs, outputs):
@@ -344,8 +337,8 @@ def perform(self, node, inputs, outputs):
def create_distance_op_from_fn(fn):
- scalarX = at.dscalar if pytensor.config.floatX == "float64" else at.fscalar
- vectorX = at.dvector if pytensor.config.floatX == "float64" else at.fvector
+ scalarX = pt.dscalar if pytensor.config.floatX == "float64" else pt.fscalar
+ vectorX = pt.dvector if pytensor.config.floatX == "float64" else pt.fvector
# Check if callable returns TensorVariable with dummy inputs
try:
@@ -358,9 +351,9 @@ def create_distance_op_from_fn(fn):
# Otherwise, automatically wrap in PyTensor Op
class Distance(Op):
def make_node(self, epsilon, obs_data, sim_data):
- epsilon = at.as_tensor_variable(epsilon)
- obs_data = at.as_tensor_variable(obs_data)
- sim_data = at.as_tensor_variable(sim_data)
+ epsilon = pt.as_tensor_variable(epsilon)
+ obs_data = pt.as_tensor_variable(obs_data)
+ sim_data = pt.as_tensor_variable(sim_data)
return Apply(self, [epsilon, obs_data, sim_data], [vectorX()])
def perform(self, node, inputs, outputs):
diff --git a/pymc/distributions/timeseries.py b/pymc/distributions/timeseries.py
index a958bbee46..4a1707b4c9 100644
--- a/pymc/distributions/timeseries.py
+++ b/pymc/distributions/timeseries.py
@@ -19,7 +19,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Node, ancestors
from pytensor.graph.replace import clone_replace
@@ -42,7 +42,7 @@
)
from pymc.exceptions import NotConstantValueError
from pymc.logprob.abstract import _logprob
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.logprob.utils import ignore_logprob, reconsider_logprob
from pymc.pytensorf import constant_fold, floatX, intX
from pymc.util import check_dist_not_registered
@@ -85,9 +85,9 @@ def __new__(cls, *args, innovation_dist, steps=None, **kwargs):
return super().__new__(cls, *args, innovation_dist=innovation_dist, steps=steps, **kwargs)
@classmethod
- def dist(cls, init_dist, innovation_dist, steps=None, **kwargs) -> at.TensorVariable:
+ def dist(cls, init_dist, innovation_dist, steps=None, **kwargs) -> pt.TensorVariable:
if not (
- isinstance(init_dist, at.TensorVariable)
+ isinstance(init_dist, pt.TensorVariable)
and init_dist.owner is not None
and isinstance(init_dist.owner.op, (RandomVariable, SymbolicRandomVariable))
):
@@ -95,7 +95,7 @@ def dist(cls, init_dist, innovation_dist, steps=None, **kwargs) -> at.TensorVari
check_dist_not_registered(init_dist)
if not (
- isinstance(innovation_dist, at.TensorVariable)
+ isinstance(innovation_dist, pt.TensorVariable)
and innovation_dist.owner is not None
and isinstance(innovation_dist.owner.op, (RandomVariable, SymbolicRandomVariable))
):
@@ -125,7 +125,7 @@ def dist(cls, init_dist, innovation_dist, steps=None, **kwargs) -> at.TensorVari
)
if steps is None:
raise ValueError("Must specify steps or shape parameter")
- steps = at.as_tensor_variable(intX(steps))
+ steps = pt.as_tensor_variable(intX(steps))
return super().dist([init_dist, innovation_dist, steps], **kwargs)
@@ -133,7 +133,7 @@ def dist(cls, init_dist, innovation_dist, steps=None, **kwargs) -> at.TensorVari
def get_steps(cls, innovation_dist, steps, shape, dims, observed):
# We need to know the ndim_supp of the innovation_dist
if not (
- isinstance(innovation_dist, at.TensorVariable)
+ isinstance(innovation_dist, pt.TensorVariable)
and innovation_dist.owner is not None
and isinstance(innovation_dist.owner.op, (RandomVariable, SymbolicRandomVariable))
):
@@ -173,7 +173,7 @@ def rv_op(cls, init_dist, innovation_dist, steps, size=None):
# If not explicit, size is determined by the shapes of the input distributions
if size is None:
- size = at.broadcast_shape(
+ size = pt.broadcast_shape(
init_dist_batch_shape, innovation_batch_shape, arrays_are_shapes=True
)
@@ -195,12 +195,12 @@ def rv_op(cls, init_dist, innovation_dist, steps, size=None):
# done directly on top of a RandomVariable. Because of this we dimshuffle the
# distributions and only then concatenate them, instead of the other way around.
# shape = (B, 1, S)
- init_dist_dimswapped_ = at.moveaxis(init_dist_, 0, -ndim_supp)
+ init_dist_dimswapped_ = pt.moveaxis(init_dist_, 0, -ndim_supp)
# shape = (B, T-1, S)
- innovation_dist_dimswapped_ = at.moveaxis(innovation_dist_, 0, -ndim_supp)
+ innovation_dist_dimswapped_ = pt.moveaxis(innovation_dist_, 0, -ndim_supp)
# shape = (B, T, S)
- grw_ = at.concatenate([init_dist_dimswapped_, innovation_dist_dimswapped_], axis=-ndim_supp)
- grw_ = at.cumsum(grw_, axis=-ndim_supp)
+ grw_ = pt.concatenate([init_dist_dimswapped_, innovation_dist_dimswapped_], axis=-ndim_supp)
+ grw_ = pt.cumsum(grw_, axis=-ndim_supp)
return RandomWalkRV(
[init_dist_, innovation_dist_, steps_],
# We pass steps_ through just so we can keep a reference to it, even though
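A minimal NumPy analogue of the graph built above (shapes and seed are arbitrary): the walk is the cumulative sum of the initial draw concatenated with the innovations along the time axis.

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(42)
    init = rng.normal(size=(1,))            # shape (1,): the starting point
    innovations = rng.normal(size=(9,))     # shape (T-1,): the steps

    walk = np.cumsum(np.concatenate([init, innovations]))  # concatenate, then cumsum
    assert walk.shape == (10,)
    assert np.isclose(walk[0], init[0])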
@@ -227,10 +227,10 @@ def random_walk_moment(op, rv, init_dist, innovation_dist, steps):
# shape = (T-1, B, S)
innovation_moment = moment(innovation_dist)
# shape = (T, B, S)
- grw_moment = at.concatenate([init_moment, innovation_moment], axis=0)
- grw_moment = at.cumsum(grw_moment, axis=0)
+ grw_moment = pt.concatenate([init_moment, innovation_moment], axis=0)
+ grw_moment = pt.cumsum(grw_moment, axis=0)
# shape = (B, T, S)
- grw_moment = at.moveaxis(grw_moment, 0, -op.ndim_supp)
+ grw_moment = pt.moveaxis(grw_moment, 0, -op.ndim_supp)
return grw_moment
@@ -258,7 +258,7 @@ def __new__(cls, name, *args, **kwargs):
return RandomWalk(name, init_dist=init_dist, innovation_dist=innovation_dist, **kwargs)
@classmethod
- def dist(cls, *args, **kwargs) -> at.TensorVariable:
+ def dist(cls, *args, **kwargs) -> pt.TensorVariable:
init_dist, innovation_dist, kwargs = cls.get_dists(*args, **kwargs)
return RandomWalk.dist(init_dist=init_dist, innovation_dist=innovation_dist, **kwargs)
@@ -277,7 +277,7 @@ class GaussianRandomWalk(PredefinedRandomWalk):
innovation drift
sigma : tensor_like of float, default 1
sigma > 0, innovation standard deviation.
- init_dist : Distribution
+ init_dist : unnamed_distribution
Unnamed univariate distribution of the initial value. Unnamed refers to distributions
created with the ``.dist()`` API.
@@ -305,8 +305,8 @@ def get_dists(cls, mu=0.0, sigma=1.0, *, init_dist=None, **kwargs):
)
init_dist = Normal.dist(0, 100)
- mu = at.as_tensor_variable(mu)
- sigma = at.as_tensor_variable(sigma)
+ mu = pt.as_tensor_variable(mu)
+ sigma = pt.as_tensor_variable(sigma)
innovation_dist = Normal.dist(mu=mu, sigma=sigma)
return init_dist, innovation_dist, kwargs
@@ -317,19 +317,18 @@ class MvGaussianRandomWalk(PredefinedRandomWalk):
Parameters
----------
- mu: tensor_like of float
+ mu : tensor_like of float
innovation drift
- cov: tensor_like of float
+ cov : tensor_like of float
pos def matrix, innovation covariance matrix
- tau: tensor_like of float
+ tau : tensor_like of float
pos def matrix, inverse covariance matrix
- chol: tensor_like of float
+ chol : tensor_like of float
Cholesky decomposition of covariance matrix
lower : bool, default=True
         Whether the Cholesky factor is given as a lower triangular matrix.
- init_dist: distribution
- Unnamed multivariate distribution of the initial value. Unnamed refers to distributions
- created with the ``.dist()`` API.
+ init_dist : unnamed_distribution
+ Unnamed multivariate distribution of the initial value.
.. warning:: init_dist will be cloned, rendering them independent of the ones passed as input.
@@ -359,7 +358,7 @@ def get_dists(cls, mu, *, cov=None, tau=None, chol=None, lower=True, init_dist=N
"You can specify an init_dist manually to suppress this warning.",
UserWarning,
)
- init_dist = MvNormal.dist(at.zeros_like(mu.shape[-1]), at.eye(mu.shape[-1]) * 100)
+ init_dist = MvNormal.dist(pt.zeros_like(mu.shape[-1]), pt.eye(mu.shape[-1]) * 100)
innovation_dist = MvNormal.dist(mu=mu, cov=cov, tau=tau, chol=chol, lower=lower)
return init_dist, innovation_dist, kwargs
@@ -370,21 +369,20 @@ class MvStudentTRandomWalk(PredefinedRandomWalk):
Parameters
----------
- nu: int
+ nu : int
degrees of freedom
- mu: tensor_like of float
+ mu : tensor_like of float
innovation drift
- scale: tensor_like of float
+ scale : tensor_like of float
pos def matrix, innovation covariance matrix
- tau: tensor_like of float
+ tau : tensor_like of float
pos def matrix, inverse covariance matrix
- chol: tensor_like of float
+ chol : tensor_like of float
Cholesky decomposition of covariance matrix
lower : bool, default=True
         Whether the Cholesky factor is given as a lower triangular matrix.
- init_dist: distribution
- Unnamed multivariate distribution of the initial value. Unnamed refers to distributions
- created with the ``.dist()`` API.
+ init_dist : unnamed_distribution
+ Unnamed multivariate distribution of the initial value.
.. warning:: init_dist will be cloned, rendering them independent of the ones passed as input.
@@ -416,7 +414,7 @@ def get_dists(
"You can specify an init_dist manually to suppress this warning.",
UserWarning,
)
- init_dist = MvNormal.dist(at.zeros_like(mu.shape[-1]), at.eye(mu.shape[-1]) * 100)
+ init_dist = MvNormal.dist(pt.zeros_like(mu.shape[-1]), pt.eye(mu.shape[-1]) * 100)
innovation_dist = MvStudentT.dist(
nu=nu, mu=mu, scale=scale, tau=tau, chol=chol, lower=lower, cov=kwargs.pop("cov", None)
@@ -471,9 +469,8 @@ class AR(Distribution):
constant : bool, default False
Whether the first element of rho should be used as a constant term in the AR
process.
- init_dist : unnamed distribution, optional
- Scalar or vector distribution for initial values. Unnamed refers to distributions
- created with the ``.dist()`` API. Distributions should have shape (*shape[:-1], ar_order).
+ init_dist : unnamed_distribution, optional
+ Scalar or vector distribution for initial values. Distributions should have shape (*shape[:-1], ar_order).
If not, it will be automatically resized. Defaults to pm.Normal.dist(0, 100, shape=...).
.. warning:: init_dist will be cloned, rendering it independent of the one passed as input.
@@ -506,7 +503,7 @@ class AR(Distribution):
rv_type = AutoRegressiveRV
def __new__(cls, name, rho, *args, steps=None, constant=False, ar_order=None, **kwargs):
- rhos = at.atleast_1d(at.as_tensor_variable(floatX(rho)))
+ rhos = pt.atleast_1d(pt.as_tensor_variable(floatX(rho)))
ar_order = cls._get_ar_order(rhos=rhos, constant=constant, ar_order=ar_order)
steps = get_support_shape_1d(
support_shape=steps,
@@ -533,8 +530,8 @@ def dist(
**kwargs,
):
_, sigma = get_tau_sigma(tau=tau, sigma=sigma)
- sigma = at.as_tensor_variable(floatX(sigma))
- rhos = at.atleast_1d(at.as_tensor_variable(floatX(rho)))
+ sigma = pt.as_tensor_variable(floatX(sigma))
+ rhos = pt.atleast_1d(pt.as_tensor_variable(floatX(rho)))
if "init" in kwargs:
warnings.warn(
@@ -549,7 +546,7 @@ def dist(
)
if steps is None:
raise ValueError("Must specify steps or shape parameter")
- steps = at.as_tensor_variable(intX(steps), ndim=0)
+ steps = pt.as_tensor_variable(intX(steps), ndim=0)
if init_dist is not None:
if not isinstance(init_dist, TensorVariable) or not isinstance(
@@ -621,7 +618,7 @@ def rv_op(cls, rhos, sigma, init_dist, steps, ar_order, constant_term, size=None
else:
# In this case the size of the init_dist depends on the parameters shape
# The last dimension of rho and init_dist does not matter
- batch_size = at.broadcast_shape(sigma, rhos[..., 0], at.atleast_1d(init_dist)[..., 0])
+ batch_size = pt.broadcast_shape(sigma, rhos[..., 0], pt.atleast_1d(init_dist)[..., 0])
if init_dist.owner.op.ndim_supp == 0:
init_dist_size = (*batch_size, ar_order)
else:
@@ -640,16 +637,16 @@ def rv_op(cls, rhos, sigma, init_dist, steps, ar_order, constant_term, size=None
if constant_term:
# In this case init shape is one unit smaller than rhos in the last dimension
rhos_bcast_shape_ = (*rhos_bcast_shape_[:-1], rhos_bcast_shape_[-1] + 1)
- rhos_bcast_ = at.broadcast_to(rhos_, rhos_bcast_shape_)
+ rhos_bcast_ = pt.broadcast_to(rhos_, rhos_bcast_shape_)
noise_rng = pytensor.shared(np.random.default_rng())
def step(*args):
*prev_xs, reversed_rhos, sigma, rng = args
if constant_term:
- mu = reversed_rhos[-1] + at.sum(prev_xs * reversed_rhos[:-1], axis=0)
+ mu = reversed_rhos[-1] + pt.sum(prev_xs * reversed_rhos[:-1], axis=0)
else:
- mu = at.sum(prev_xs * reversed_rhos, axis=0)
+ mu = pt.sum(prev_xs * reversed_rhos, axis=0)
next_rng, new_x = Normal.dist(mu=mu, sigma=sigma, rng=rng).owner.outputs
return new_x, {rng: next_rng}
@@ -662,7 +659,7 @@ def step(*args):
strict=True,
)
(noise_next_rng,) = tuple(innov_updates_.values())
- ar_ = at.concatenate([init_, innov_.T], axis=-1)
+ ar_ = pt.concatenate([init_, innov_.T], axis=-1)
ar_op = AutoRegressiveRV(
inputs=[rhos_, sigma_, init_, steps_],
@@ -699,7 +696,7 @@ def ar_logp(op, values, rhos, sigma, init_dist, steps, noise_rng, **kwargs):
# Convolve rhos with values
if constant_term:
- expectation = at.add(
+ expectation = pt.add(
rhos[..., 0, None],
*(
rhos[..., i + 1, None] * value[..., ar_order - (i + 1) : -(i + 1)]
@@ -707,26 +704,26 @@ def ar_logp(op, values, rhos, sigma, init_dist, steps, noise_rng, **kwargs):
),
)
else:
- expectation = at.add(
+ expectation = pt.add(
*(
rhos[..., i, None] * value[..., ar_order - (i + 1) : -(i + 1)]
for i in range(ar_order)
)
)
# Compute and collapse logp across time dimension
- innov_logp = at.sum(
+ innov_logp = pt.sum(
logp(Normal.dist(0, sigma[..., None]), value[..., ar_order:] - expectation), axis=-1
)
init_logp = logp(init_dist, value[..., :ar_order])
if init_dist.owner.op.ndim_supp == 0:
- init_logp = at.sum(init_logp, axis=-1)
+ init_logp = pt.sum(init_logp, axis=-1)
return init_logp + innov_logp
@_moment.register(AutoRegressiveRV)
def ar_moment(op, rv, rhos, sigma, init_dist, steps, noise_rng):
# Use last entry of init_dist moment as the moment for the whole AR
- return at.full_like(rv, moment(init_dist)[..., -1, None])
+ return pt.full_like(rv, moment(init_dist)[..., -1, None])
class GARCH11RV(SymbolicRandomVariable):
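A NumPy sketch of the indexing used in the non-constant branch of the AR logp above (coefficients and series are made up): each ``rhos[i]`` multiplies the series lagged by ``i + 1``, leaving ``T - ar_order`` predicted values.

.. code-block:: python

    import numpy as np

    rhos = np.array([0.5, -0.3])                      # AR(2) coefficients
    value = np.random.default_rng(0).normal(size=20)
    ar_order = len(rhos)

    expectation = sum(
        rhos[i] * value[ar_order - (i + 1) : -(i + 1)] for i in range(ar_order)
    )
    assert expectation.shape == (len(value) - ar_order,)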
@@ -755,13 +752,13 @@ class GARCH11(Distribution):
Parameters
----------
- omega: tensor
+ omega : tensor_like of float
omega > 0, mean variance
- alpha_1: tensor
+ alpha_1 : tensor_like of float
alpha_1 >= 0, autoregressive term coefficient
- beta_1: tensor
+ beta_1 : tensor_like of float
beta_1 >= 0, alpha_1 + beta_1 < 1, moving average term coefficient
- initial_vol: tensor
+ initial_vol : tensor_like of float
initial_vol >= 0, initial volatility, sigma_0
"""
@@ -784,12 +781,12 @@ def dist(cls, omega, alpha_1, beta_1, initial_vol, *, steps=None, **kwargs):
)
if steps is None:
raise ValueError("Must specify steps or shape parameter")
- steps = at.as_tensor_variable(intX(steps), ndim=0)
+ steps = pt.as_tensor_variable(intX(steps), ndim=0)
- omega = at.as_tensor_variable(omega)
- alpha_1 = at.as_tensor_variable(alpha_1)
- beta_1 = at.as_tensor_variable(beta_1)
- initial_vol = at.as_tensor_variable(initial_vol)
+ omega = pt.as_tensor_variable(omega)
+ alpha_1 = pt.as_tensor_variable(alpha_1)
+ beta_1 = pt.as_tensor_variable(beta_1)
+ initial_vol = pt.as_tensor_variable(initial_vol)
init_dist = Normal.dist(0, initial_vol)
# We can ignore init_dist, as it will be accounted for in the logp term
@@ -803,9 +800,9 @@ def rv_op(cls, omega, alpha_1, beta_1, initial_vol, init_dist, steps, size=None)
batch_size = size
else:
# In this case the size of the init_dist depends on the parameters shape
- batch_size = at.broadcast_shape(omega, alpha_1, beta_1, initial_vol)
+ batch_size = pt.broadcast_shape(omega, alpha_1, beta_1, initial_vol)
init_dist = change_dist_size(init_dist, batch_size)
- # initial_vol = initial_vol * at.ones(batch_size)
+ # initial_vol = initial_vol * pt.ones(batch_size)
# Create OpFromGraph representing random draws from GARCH11 process
# Variables with underscore suffix are dummy inputs into the OpFromGraph
@@ -819,22 +816,22 @@ def rv_op(cls, omega, alpha_1, beta_1, initial_vol, init_dist, steps, size=None)
noise_rng = pytensor.shared(np.random.default_rng())
def step(prev_y, prev_sigma, omega, alpha_1, beta_1, rng):
- new_sigma = at.sqrt(
- omega + alpha_1 * at.square(prev_y) + beta_1 * at.square(prev_sigma)
+ new_sigma = pt.sqrt(
+ omega + alpha_1 * pt.square(prev_y) + beta_1 * pt.square(prev_sigma)
)
next_rng, new_y = Normal.dist(mu=0, sigma=new_sigma, rng=rng).owner.outputs
return (new_y, new_sigma), {rng: next_rng}
(y_t, _), innov_updates_ = pytensor.scan(
fn=step,
- outputs_info=[init_, initial_vol_ * at.ones(batch_size)],
+ outputs_info=[init_, initial_vol_ * pt.ones(batch_size)],
non_sequences=[omega_, alpha_1_, beta_1_, noise_rng],
n_steps=steps_,
strict=True,
)
(noise_next_rng,) = tuple(innov_updates_.values())
- garch11_ = at.concatenate([init_[None, ...], y_t], axis=0).dimshuffle(
+ garch11_ = pt.concatenate([init_[None, ...], y_t], axis=0).dimshuffle(
tuple(range(1, y_t.ndim)) + (0,)
)
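The same recursion as the ``step`` function above, written as a plain NumPy loop (parameter values and seed are illustrative): the conditional volatility is updated from the previous draw and the previous volatility.

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(1)
    omega, alpha_1, beta_1, initial_vol = 0.1, 0.2, 0.7, 1.0

    y, sigma = [rng.normal(0, initial_vol)], [initial_vol]
    for _ in range(100):
        # sigma_t = sqrt(omega + alpha_1 * y_{t-1}**2 + beta_1 * sigma_{t-1}**2)
        sigma.append(np.sqrt(omega + alpha_1 * y[-1] ** 2 + beta_1 * sigma[-1] ** 2))
        y.append(rng.normal(0, sigma[-1]))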
@@ -867,10 +864,10 @@ def garch11_logp(
(value,) = values
# Move the time axis to the first dimension
value_dimswapped = value.dimshuffle((value.ndim - 1,) + tuple(range(0, value.ndim - 1)))
- initial_vol = initial_vol * at.ones_like(value_dimswapped[0])
+ initial_vol = initial_vol * pt.ones_like(value_dimswapped[0])
def volatility_update(x, vol, w, a, b):
- return at.sqrt(w + a * at.square(x) + b * at.square(vol))
+ return pt.sqrt(w + a * pt.square(x) + b * pt.square(vol))
vol, _ = pytensor.scan(
fn=volatility_update,
@@ -879,16 +876,16 @@ def volatility_update(x, vol, w, a, b):
non_sequences=[omega, alpha_1, beta_1],
strict=True,
)
- sigma_t = at.concatenate([[initial_vol], vol])
+ sigma_t = pt.concatenate([[initial_vol], vol])
# Compute and collapse logp across time dimension
- innov_logp = at.sum(logp(Normal.dist(0, sigma_t), value_dimswapped), axis=0)
+ innov_logp = pt.sum(logp(Normal.dist(0, sigma_t), value_dimswapped), axis=0)
return innov_logp
@_moment.register(GARCH11RV)
def garch11_moment(op, rv, omega, alpha_1, beta_1, initial_vol, init_dist, steps, noise_rng):
# GARCH(1,1) mean is zero
- return at.zeros_like(rv)
+ return pt.zeros_like(rv)
class EulerMaruyamaRV(SymbolicRandomVariable):
@@ -922,9 +919,8 @@ class EulerMaruyama(Distribution):
function returning the drift and diffusion coefficients of SDE
sde_pars : tuple
parameters of the SDE, passed as ``*args`` to ``sde_fn``
- init_dist : unnamed distribution, optional
- Scalar distribution for initial values. Unnamed refers to distributions created with
- the ``.dist()`` API. Distributions should have shape (*shape[:-1]).
+ init_dist : unnamed_distribution, optional
+ Scalar distribution for initial values. Distributions should have shape (*shape[:-1]).
If not, it will be automatically resized. Defaults to pm.Normal.dist(0, 100, shape=...).
.. warning:: init_dist will be cloned, rendering it independent of the one passed as input.
@@ -933,7 +929,7 @@ class EulerMaruyama(Distribution):
rv_type = EulerMaruyamaRV
def __new__(cls, name, dt, sde_fn, *args, steps=None, **kwargs):
- dt = at.as_tensor_variable(floatX(dt))
+ dt = pt.as_tensor_variable(floatX(dt))
steps = get_support_shape_1d(
support_shape=steps,
shape=None, # Shape will be checked in `cls.dist`
@@ -950,10 +946,10 @@ def dist(cls, dt, sde_fn, sde_pars, *, init_dist=None, steps=None, **kwargs):
)
if steps is None:
raise ValueError("Must specify steps or shape parameter")
- steps = at.as_tensor_variable(intX(steps), ndim=0)
+ steps = pt.as_tensor_variable(intX(steps), ndim=0)
- dt = at.as_tensor_variable(floatX(dt))
- sde_pars = [at.as_tensor_variable(x) for x in sde_pars]
+ dt = pt.as_tensor_variable(floatX(dt))
+ sde_pars = [pt.as_tensor_variable(x) for x in sde_pars]
if init_dist is not None:
if not isinstance(init_dist, TensorVariable) or not isinstance(
@@ -988,7 +984,7 @@ def rv_op(cls, init_dist, steps, sde_pars, dt, sde_fn, size=None):
if size is not None:
batch_size = size
else:
- batch_size = at.broadcast_shape(*sde_pars, init_dist)
+ batch_size = pt.broadcast_shape(*sde_pars, init_dist)
init_dist = change_dist_size(init_dist, batch_size)
# Create OpFromGraph representing random draws from SDE process
@@ -1003,7 +999,7 @@ def step(*prev_args):
prev_y, *prev_sde_pars, rng = prev_args
f, g = sde_fn(prev_y, *prev_sde_pars)
mu = prev_y + dt * f
- sigma = at.sqrt(dt) * g
+ sigma = pt.sqrt(dt) * g
next_rng, next_y = Normal.dist(mu=mu, sigma=sigma, rng=rng).owner.outputs
return next_y, {rng: next_rng}
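A NumPy sketch of the same Euler-Maruyama update for an illustrative scalar SDE (the ``sde_fn``, ``lam``, step size and step count are assumptions): each step draws from ``Normal(y + dt * f, sqrt(dt) * g)``.

.. code-block:: python

    import numpy as np

    def sde_fn(y, lam):
        return -lam * y, 0.1          # drift f and diffusion g

    rng = np.random.default_rng(2)
    dt, y = 0.01, [1.0]
    for _ in range(1000):
        f, g = sde_fn(y[-1], lam=0.5)
        y.append(rng.normal(y[-1] + dt * f, np.sqrt(dt) * g))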
@@ -1016,7 +1012,7 @@ def step(*prev_args):
)
(noise_next_rng,) = tuple(innov_updates_.values())
- sde_out_ = at.concatenate([init_[None, ...], y_t], axis=0).dimshuffle(
+ sde_out_ = pt.concatenate([init_[None, ...], y_t], axis=0).dimshuffle(
tuple(range(1, y_t.ndim)) + (0,)
)
@@ -1062,8 +1058,8 @@ def eulermaruyama_logp(op, values, init_dist, steps, *sde_pars_noise_arg, **kwar
xt = x[..., 1:]
f, g = op.sde_fn(xtm1, *sde_pars_broadcast)
mu = xtm1 + op.dt * f
- sigma = at.sqrt(op.dt) * g
+ sigma = pt.sqrt(op.dt) * g
# Compute and collapse logp across time dimension
- sde_logp = at.sum(logp(Normal.dist(mu, sigma), xt), axis=-1)
+ sde_logp = pt.sum(logp(Normal.dist(mu, sigma), xt), axis=-1)
init_logp = logp(init_dist, x[..., 0])
return init_logp + sde_logp
diff --git a/pymc/distributions/transforms.py b/pymc/distributions/transforms.py
index ad4745354b..b873eba235 100644
--- a/pymc/distributions/transforms.py
+++ b/pymc/distributions/transforms.py
@@ -14,7 +14,7 @@
from functools import singledispatch
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
# ignore mypy error because it somehow considers that
# "numpy.core.numeric has no attribute normalize_axis_tuple"
@@ -60,7 +60,7 @@ class LogExpM1(RVTransform):
name = "log_exp_m1"
def backward(self, value, *inputs):
- return at.softplus(value)
+ return pt.softplus(value)
def forward(self, value, *inputs):
"""Inverse operation of softplus.
@@ -68,10 +68,10 @@ def forward(self, value, *inputs):
y = Log(Exp(x) - 1)
= Log(1 - Exp(-x)) + x
"""
- return at.log(1.0 - at.exp(-value)) + value
+ return pt.log(1.0 - pt.exp(-value)) + value
def log_jac_det(self, value, *inputs):
- return -at.softplus(-value)
+ return -pt.softplus(-value)
class Ordered(RVTransform):
@@ -86,22 +86,22 @@ def __init__(self, ndim_supp=0):
self.ndim_supp = ndim_supp
def backward(self, value, *inputs):
- x = at.zeros(value.shape)
- x = at.inc_subtensor(x[..., 0], value[..., 0])
- x = at.inc_subtensor(x[..., 1:], at.exp(value[..., 1:]))
- return at.cumsum(x, axis=-1)
+ x = pt.zeros(value.shape)
+ x = pt.inc_subtensor(x[..., 0], value[..., 0])
+ x = pt.inc_subtensor(x[..., 1:], pt.exp(value[..., 1:]))
+ return pt.cumsum(x, axis=-1)
def forward(self, value, *inputs):
- y = at.zeros(value.shape)
- y = at.inc_subtensor(y[..., 0], value[..., 0])
- y = at.inc_subtensor(y[..., 1:], at.log(value[..., 1:] - value[..., :-1]))
+ y = pt.zeros(value.shape)
+ y = pt.inc_subtensor(y[..., 0], value[..., 0])
+ y = pt.inc_subtensor(y[..., 1:], pt.log(value[..., 1:] - value[..., :-1]))
return y
def log_jac_det(self, value, *inputs):
if self.ndim_supp == 0:
- return at.sum(value[..., 1:], axis=-1, keepdims=True)
+ return pt.sum(value[..., 1:], axis=-1, keepdims=True)
else:
- return at.sum(value[..., 1:], axis=-1)
+ return pt.sum(value[..., 1:], axis=-1)
class SumTo1(RVTransform):
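A NumPy round trip of the ordered transform above (values are arbitrary but strictly increasing): the forward pass keeps the first entry and logs the successive differences, the backward pass exponentiates and cumulatively sums.

.. code-block:: python

    import numpy as np

    x = np.array([-1.0, 0.5, 2.0])                              # strictly increasing
    y = np.concatenate([x[:1], np.log(np.diff(x))])             # forward
    x_back = np.cumsum(np.concatenate([y[:1], np.exp(y[1:])]))  # backward
    assert np.allclose(x, x_back)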
@@ -121,18 +121,18 @@ def __init__(self, ndim_supp=0):
self.ndim_supp = ndim_supp
def backward(self, value, *inputs):
- remaining = 1 - at.sum(value[..., :], axis=-1, keepdims=True)
- return at.concatenate([value[..., :], remaining], axis=-1)
+ remaining = 1 - pt.sum(value[..., :], axis=-1, keepdims=True)
+ return pt.concatenate([value[..., :], remaining], axis=-1)
def forward(self, value, *inputs):
return value[..., :-1]
def log_jac_det(self, value, *inputs):
- y = at.zeros(value.shape)
+ y = pt.zeros(value.shape)
if self.ndim_supp == 0:
- return at.sum(y, axis=-1, keepdims=True)
+ return pt.sum(y, axis=-1, keepdims=True)
else:
- return at.sum(y, axis=-1)
+ return pt.sum(y, axis=-1)
class CholeskyCovPacked(RVTransform):
@@ -151,16 +151,16 @@ def __init__(self, n):
n: int
Number of diagonal entries in the LKJCholeskyCov distribution
"""
- self.diag_idxs = at.arange(1, n + 1).cumsum() - 1
+ self.diag_idxs = pt.arange(1, n + 1).cumsum() - 1
def backward(self, value, *inputs):
- return at.set_subtensor(value[..., self.diag_idxs], at.exp(value[..., self.diag_idxs]))
+ return pt.set_subtensor(value[..., self.diag_idxs], pt.exp(value[..., self.diag_idxs]))
def forward(self, value, *inputs):
- return at.set_subtensor(value[..., self.diag_idxs], at.log(value[..., self.diag_idxs]))
+ return pt.set_subtensor(value[..., self.diag_idxs], pt.log(value[..., self.diag_idxs]))
def log_jac_det(self, value, *inputs):
- return at.sum(value[..., self.diag_idxs], axis=-1)
+ return pt.sum(value[..., self.diag_idxs], axis=-1)
class Chain(RVTransform):
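For the packed lower-triangular layout assumed above, the diagonal index computation can be checked directly in NumPy (``n = 3`` is an arbitrary choice).

.. code-block:: python

    import numpy as np

    n = 3
    # Packed layout of a lower-triangular matrix: [L00, L10, L11, L20, L21, L22]
    diag_idxs = np.arange(1, n + 1).cumsum() - 1
    assert np.array_equal(diag_idxs, [0, 2, 5])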
@@ -185,7 +185,7 @@ def backward(self, value, *inputs):
return x
def log_jac_det(self, value, *inputs):
- y = at.as_tensor_variable(value)
+ y = pt.as_tensor_variable(value)
det_list = []
ndim0 = y.ndim
for transf in reversed(self.transform_list):
@@ -276,7 +276,7 @@ def __init__(self, lower=None, upper=None, *, bounds_fn=None):
if bounds_fn is None:
try:
bounds = tuple(
- None if bound is None else at.constant(bound, ndim=0).data
+ None if bound is None else pt.constant(bound, ndim=0).data
for bound in (lower, upper)
)
except (ValueError, TypeError):
@@ -326,16 +326,16 @@ def backward(self, value, *rv_inputs):
return value
def log_jac_det(self, value, *rv_inputs):
- return at.constant(0.0)
+ return pt.constant(0.0)
def extend_axis(array, axis):
n = array.shape[axis] + 1
sum_vals = array.sum(axis, keepdims=True)
- norm = sum_vals / (at.sqrt(n) + n)
- fill_val = norm - sum_vals / at.sqrt(n)
+ norm = sum_vals / (pt.sqrt(n) + n)
+ fill_val = norm - sum_vals / pt.sqrt(n)
- out = at.concatenate([array, fill_val], axis=axis)
+ out = pt.concatenate([array, fill_val], axis=axis)
return out - norm
@@ -343,10 +343,10 @@ def extend_axis_rev(array, axis):
normalized_axis = normalize_axis_tuple(axis, array.ndim)[0]
n = array.shape[normalized_axis]
- last = at.take(array, [-1], axis=normalized_axis)
+ last = pt.take(array, [-1], axis=normalized_axis)
- sum_vals = -last * at.sqrt(n)
- norm = sum_vals / (at.sqrt(n) + n)
+ sum_vals = -last * pt.sqrt(n)
+ norm = sum_vals / (pt.sqrt(n) + n)
slice_before = (slice(None, None),) * normalized_axis
return array[slice_before + (slice(None, -1),)] + norm
diff --git a/pymc/distributions/truncated.py b/pymc/distributions/truncated.py
index e15427bb9b..05616fbe4c 100644
--- a/pymc/distributions/truncated.py
+++ b/pymc/distributions/truncated.py
@@ -15,7 +15,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor import scan
from pytensor.graph import Op
@@ -37,7 +37,8 @@
from pymc.distributions.shape_utils import _change_dist_size, change_dist_size, to_tuple
from pymc.distributions.transforms import _default_transform
from pymc.exceptions import TruncationError
-from pymc.logprob.abstract import MeasurableVariable, _logcdf, _logprob, icdf, logcdf
+from pymc.logprob.abstract import MeasurableVariable, _logcdf, _logprob
+from pymc.logprob.basic import icdf, logcdf
from pymc.math import logdiffexp
from pymc.util import check_dist_not_registered
@@ -167,11 +168,11 @@ def rv_op(cls, dist, lower, upper, max_n_steps, size=None):
except NotImplementedError:
pass
- lower = at.as_tensor_variable(lower) if lower is not None else at.constant(-np.inf)
- upper = at.as_tensor_variable(upper) if upper is not None else at.constant(np.inf)
+ lower = pt.as_tensor_variable(lower) if lower is not None else pt.constant(-np.inf)
+ upper = pt.as_tensor_variable(upper) if upper is not None else pt.constant(np.inf)
if size is None:
- size = at.broadcast_shape(dist, lower, upper)
+ size = pt.broadcast_shape(dist, lower, upper)
dist = change_dist_size(dist, new_size=size)
# Variables with `_` suffix identify dummy inputs for the OpFromGraph
@@ -189,11 +190,11 @@ def rv_op(cls, dist, lower, upper, max_n_steps, size=None):
# For left truncated discrete RVs, we need to include the whole lower bound.
# This may result in draws below the truncation range, if any uniform == 0
lower_value = lower_ - 1 if dist.owner.op.dtype.startswith("int") else lower_
- cdf_lower_ = at.exp(logcdf(rv_, lower_value))
- cdf_upper_ = at.exp(logcdf(rv_, upper_))
+ cdf_lower_ = pt.exp(logcdf(rv_, lower_value))
+ cdf_upper_ = pt.exp(logcdf(rv_, upper_))
# It's okay to reuse the same rng here, because the rng in rv_ will not be
# used by either the logcdf of icdf functions
- uniform_ = at.random.uniform(
+ uniform_ = pt.random.uniform(
cdf_lower_,
cdf_upper_,
rng=rng,
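A SciPy sketch of the inverse-CDF strategy used above (the standard normal and the bounds are arbitrary choices): uniforms are drawn between the CDF values of the bounds and pushed through the untruncated inverse CDF.

.. code-block:: python

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(3)
    lower, upper = -1.0, 2.0

    u = rng.uniform(stats.norm.cdf(lower), stats.norm.cdf(upper), size=1000)
    draws = stats.norm.ppf(u)
    assert draws.min() >= lower and draws.max() <= upper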
@@ -213,23 +214,23 @@ def rv_op(cls, dist, lower, upper, max_n_steps, size=None):
# Fallback to rejection sampling
def loop_fn(truncated_rv, reject_draws, lower, upper, rng, *rv_inputs):
next_rng, new_truncated_rv = dist.owner.op.make_node(rng, *rv_inputs).outputs
- truncated_rv = at.set_subtensor(
+ truncated_rv = pt.set_subtensor(
truncated_rv[reject_draws],
new_truncated_rv[reject_draws],
)
- reject_draws = at.or_((truncated_rv < lower), (truncated_rv > upper))
+ reject_draws = pt.or_((truncated_rv < lower), (truncated_rv > upper))
return (
(truncated_rv, reject_draws),
[(rng, next_rng)],
- until(~at.any(reject_draws)),
+ until(~pt.any(reject_draws)),
)
(truncated_rv_, reject_draws_), updates = scan(
loop_fn,
outputs_info=[
- at.zeros_like(rv_),
- at.ones_like(rv_, dtype=bool),
+ pt.zeros_like(rv_),
+ pt.ones_like(rv_, dtype=bool),
],
non_sequences=[lower_, upper_, rng, *rv_inputs_],
n_steps=max_n_steps,
@@ -237,7 +238,7 @@ def loop_fn(truncated_rv, reject_draws, lower, upper, rng, *rv_inputs):
)
truncated_rv_ = truncated_rv_[-1]
- convergence_ = ~at.any(reject_draws_[-1])
+ convergence_ = ~pt.any(reject_draws_[-1])
truncated_rv_ = TruncationCheck(f"Truncation did not converge in {max_n_steps} steps")(
truncated_rv_, convergence_
)
@@ -276,18 +277,18 @@ def truncated_moment(op, rv, *inputs):
untruncated_rv = op.base_rv_op.make_node(rng, *rv_inputs).default_output()
untruncated_moment = moment(untruncated_rv)
- fallback_moment = at.switch(
- at.and_(at.bitwise_not(at.isinf(lower)), at.bitwise_not(at.isinf(upper))),
+ fallback_moment = pt.switch(
+ pt.and_(pt.bitwise_not(pt.isinf(lower)), pt.bitwise_not(pt.isinf(upper))),
(upper - lower) / 2, # lower and upper are finite
- at.switch(
- at.isinf(upper),
+ pt.switch(
+ pt.isinf(upper),
lower + 1, # only lower is finite
upper - 1, # only upper is finite
),
)
- return at.switch(
- at.and_(at.ge(untruncated_moment, lower), at.le(untruncated_moment, upper)),
+ return pt.switch(
+ pt.and_(pt.ge(untruncated_moment, lower), pt.le(untruncated_moment, upper)),
untruncated_moment, # untruncated moment is between lower and upper
fallback_moment,
)
@@ -329,22 +330,22 @@ def truncated_logprob(op, values, *inputs, **kwargs):
if is_lower_bounded and is_upper_bounded:
lognorm = logdiffexp(upper_logcdf, lower_logcdf)
elif is_lower_bounded:
- lognorm = at.log1mexp(lower_logcdf)
+ lognorm = pt.log1mexp(lower_logcdf)
elif is_upper_bounded:
lognorm = upper_logcdf
logp = logp - lognorm
if is_lower_bounded:
- logp = at.switch(value < lower, -np.inf, logp)
+ logp = pt.switch(value < lower, -np.inf, logp)
if is_upper_bounded:
- logp = at.switch(value <= upper, logp, -np.inf)
+ logp = pt.switch(value <= upper, logp, -np.inf)
if is_lower_bounded and is_upper_bounded:
logp = check_parameters(
logp,
- at.le(lower, upper),
+ pt.le(lower, upper),
msg="lower_bound <= upper_bound",
)
diff --git a/pymc/func_utils.py b/pymc/func_utils.py
index cbfe55ac7f..4030f61747 100644
--- a/pymc/func_utils.py
+++ b/pymc/func_utils.py
@@ -14,7 +14,7 @@
from typing import Callable, Dict, Optional, Union
import numpy as np
-import pytensor.tensor as aet
+import pytensor.tensor as pt
from pytensor.gradient import NullTypeGradError
from scipy import optimize
@@ -139,7 +139,7 @@ def find_constrained_prior(
"Feel free to open a pull request on PyMC repo if you really need this feature."
)
- dist_params = aet.vector("dist_params")
+ dist_params = pt.vector("dist_params")
params_to_optim = {
arg_name: dist_params[i] for arg_name, i in zip(init_guess.keys(), range(len(init_guess)))
}
@@ -159,10 +159,10 @@ def find_constrained_prior(
"need it."
)
- target = (aet.exp(logcdf_lower) - mass_below_lower) ** 2
+ target = (pt.exp(logcdf_lower) - mass_below_lower) ** 2
target_fn = pm.pytensorf.compile_pymc([dist_params], target, allow_input_downcast=True)
- constraint = aet.exp(logcdf_upper) - aet.exp(logcdf_lower)
+ constraint = pt.exp(logcdf_upper) - pt.exp(logcdf_lower)
constraint_fn = pm.pytensorf.compile_pymc([dist_params], constraint, allow_input_downcast=True)
jac: Union[str, Callable]
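A usage sketch of the public helper that wraps ``target_fn`` and ``constraint_fn`` above (the distribution, bounds and initial guess are illustrative; the exact output depends on the optimizer).

.. code-block:: python

    import pymc as pm

    params = pm.find_constrained_prior(
        pm.Gamma, lower=0.1, upper=0.4, init_guess={"alpha": 1, "beta": 10}
    )
    print(params)  # optimized alpha/beta putting most of the mass in [0.1, 0.4]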
diff --git a/pymc/gp/__init__.py b/pymc/gp/__init__.py
index 25fbf7dcf1..99c8023398 100644
--- a/pymc/gp/__init__.py
+++ b/pymc/gp/__init__.py
@@ -22,3 +22,4 @@
MarginalKron,
MarginalSparse,
)
+from pymc.gp.hsgp_approx import HSGP
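A hedged usage sketch of the newly exported class, assuming the constructor accepts one basis size ``m`` per active dimension, a boundary factor ``c``, and a covariance that implements a power spectral density (as ``ExpQuad`` does below); names and values are illustrative.

.. code-block:: python

    import numpy as np
    import pymc as pm

    X = np.linspace(0, 10, 100)[:, None]

    with pm.Model():
        cov = pm.gp.cov.ExpQuad(1, ls=1.0)
        gp = pm.gp.HSGP(m=[25], c=4.0, cov_func=cov)
        f = gp.prior("f", X=X)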
diff --git a/pymc/gp/cov.py b/pymc/gp/cov.py
index a459ff00f4..134c3f207e 100644
--- a/pymc/gp/cov.py
+++ b/pymc/gp/cov.py
@@ -12,15 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import numbers
import warnings
+from collections import Counter
from functools import reduce
-from numbers import Number
from operator import add, mul
+from typing import Optional, Sequence
import numpy as np
-import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Variable
from pytensor.tensor.sharedvar import TensorSharedVariable
@@ -47,26 +48,10 @@
]
-class Covariance:
- r"""
- Base class for all kernels/covariance functions.
-
- Parameters
- ----------
- input_dim: integer
- The number of input dimensions, or columns of X (or Xs)
- the kernel will operate on.
- active_dims: List of integers
- Indicate which dimension or column of X the covariance
- function operates on.
+class BaseCovariance:
+ """
+ Base class for kernels/covariance functions.
"""
-
- def __init__(self, input_dim, active_dims=None):
- self.input_dim = input_dim
- if active_dims is None:
- self.active_dims = np.arange(input_dim)
- else:
- self.active_dims = np.asarray(active_dims, int)
def __call__(self, X, Xs=None, diag=False):
r"""
@@ -89,27 +74,14 @@ def __call__(self, X, Xs=None, diag=False):
def diag(self, X):
raise NotImplementedError
- def full(self, X, Xs):
+ def full(self, X, Xs=None):
raise NotImplementedError
- def _slice(self, X, Xs):
- xdims = X.shape[-1]
- if isinstance(xdims, Variable):
- xdims = xdims.eval()
- if self.input_dim != xdims:
- warnings.warn(
- f"Only {self.input_dim} column(s) out of {xdims} are"
- " being used to compute the covariance function. If this"
- " is not intended, increase 'input_dim' parameter to"
- " the number of columns to use. Ignore otherwise.",
- UserWarning,
- )
- X = at.as_tensor_variable(X[:, self.active_dims])
- if Xs is not None:
- Xs = at.as_tensor_variable(Xs[:, self.active_dims])
- return X, Xs
-
def __add__(self, other):
+ # If it's a scalar, cast as Constant covariance. This allows validation for power spectral
+ # density calc.
+ if isinstance(other, numbers.Real):
+ other = Constant(c=other)
return Add([self, other])
def __mul__(self, other):
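A small sketch of what the scalar branch above enables (lengthscale and grid are arbitrary): adding a plain number to a kernel wraps it in a ``Constant`` covariance, per the comment above, so that power spectral density validation can reason about it.

.. code-block:: python

    import numpy as np
    import pymc as pm

    cov = pm.gp.cov.ExpQuad(1, ls=0.5) + 1.0   # the scalar becomes Constant(c=1.0)
    X = np.linspace(0, 1, 5)[:, None]
    assert cov(X).eval().shape == (5, 5)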
@@ -122,19 +94,10 @@ def __rmul__(self, other):
return self.__mul__(other)
def __pow__(self, other):
- if (
- isinstance(other, pytensor.compile.SharedVariable)
- and other.get_value().squeeze().shape == ()
- ):
- other = at.squeeze(other)
- return Exponentiated(self, other)
- elif isinstance(other, Number):
- return Exponentiated(self, other)
- elif np.asarray(other).squeeze().shape == ():
- other = np.squeeze(other)
- return Exponentiated(self, other)
-
- raise ValueError("A covariance function can only be exponentiated by a scalar value")
+ other = pt.as_tensor_variable(other).squeeze()
+ if not other.ndim == 0:
+ raise ValueError("A covariance function can only be exponentiated by a scalar value")
+ return Exponentiated(self, other)
def __array_wrap__(self, result):
"""
@@ -151,41 +114,126 @@ def __array_wrap__(self, result):
A = np.zeros((r, c))
for i in range(r):
for j in range(c):
- A[i, j] = result[i, j].factor_list[1]
+ r = result[i, j]._factor_list[1]
+ if isinstance(r, Constant):
+ # Counteract the elemwise Add edgecase
+ r = r.c
+ A[i, j] = r
if isinstance(result[0][0], Add):
- return result[0][0].factor_list[0] + A
+ return result[0][0]._factor_list[0] + A
elif isinstance(result[0][0], Prod):
- return result[0][0].factor_list[0] * A
+ return result[0][0]._factor_list[0] * A
else:
raise TypeError(
- f"Unknown Covariance combination type {result[0][0]}. Known types are `Add` or `Prod`."
+ f"Unknown Covariance combination type {result[0][0]}. "
+ "Known types are `Add` or `Prod`."
+ )
+
+
+class Covariance(BaseCovariance):
+ """
+ Base class for kernels/covariance functions with input_dim and active_dims, which excludes
+ kernels like `Constant` and `WhiteNoise`.
+
+ Parameters
+ ----------
+ input_dim: integer
+ The number of input dimensions, or columns of X (or Xs)
+ the kernel will operate on.
+ active_dims: List of integers
+ Indicate which dimension or column of X the covariance
+ function operates on.
+ """
+
+ def __init__(self, input_dim: int, active_dims: Optional[Sequence[int]] = None):
+ self.input_dim = input_dim
+ if active_dims is None:
+ self.active_dims = np.arange(input_dim)
+ else:
+ self.active_dims = np.asarray(active_dims, int)
+
+ if max(self.active_dims) > self.input_dim:
+ raise ValueError("Values in `active_dims` can't be larger than `input_dim`.")
+
+ @property
+ def n_dims(self):
+ """The dimensionality of the input, as taken from the
+ `active_dims`.
+ """
+        # Evaluate lazily in case this changes.
+ return len(self.active_dims)
+
+ def _slice(self, X, Xs=None):
+ xdims = X.shape[-1]
+ if isinstance(xdims, Variable):
+ xdims = xdims.eval()
+ if self.input_dim != xdims:
+ warnings.warn(
+ f"Only {self.input_dim} column(s) out of {xdims} are"
+ " being used to compute the covariance function. If this"
+ " is not intended, increase 'input_dim' parameter to"
+ " the number of columns to use. Ignore otherwise.",
+ UserWarning,
)
+ X = pt.as_tensor_variable(X[:, self.active_dims])
+ if Xs is not None:
+ Xs = pt.as_tensor_variable(Xs[:, self.active_dims])
+ return X, Xs
class Combination(Covariance):
def __init__(self, factor_list):
- input_dim = max(
- factor.input_dim for factor in factor_list if isinstance(factor, Covariance)
+ """Use constituent factors to get input_dim and active_dims for the Combination covariance."""
+
+ # Check if all input_dim are the same in factor_list
+ input_dims = {factor.input_dim for factor in factor_list if isinstance(factor, Covariance)}
+
+ if len(input_dims) != 1:
+ raise ValueError("All covariances must have the same `input_dim`.")
+ input_dim = input_dims.pop()
+
+ # Union all active_dims sets in factor_list for the combination covariance
+ active_dims = np.sort(
+ np.asarray(
+ list(
+ set.union(
+ *[
+ set(factor.active_dims)
+ for factor in factor_list
+ if isinstance(factor, Covariance)
+ ]
+ )
+ ),
+ dtype=int,
+ )
)
- super().__init__(input_dim=input_dim)
- self.factor_list = []
+
+ super().__init__(input_dim=input_dim, active_dims=active_dims)
+
+        # Set up combination kernel, flatten out factor_list so that nested
+        # combinations are merged into a single flat list of factors.
+ self._factor_list = []
for factor in factor_list:
if isinstance(factor, self.__class__):
- self.factor_list.extend(factor.factor_list)
+ self._factor_list.extend(factor._factor_list)
else:
- self.factor_list.append(factor)
+ self._factor_list.append(factor)
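A sketch of the combination rules introduced above (dimensions and kernels are illustrative): factors must agree on ``input_dim``, and the combination's ``active_dims`` is the union of the factors' ``active_dims``.

.. code-block:: python

    import numpy as np
    import pymc as pm

    k1 = pm.gp.cov.ExpQuad(3, ls=1.0, active_dims=[0])
    k2 = pm.gp.cov.Matern52(3, ls=1.0, active_dims=[1, 2])
    k = k1 * k2

    assert k.input_dim == 3
    assert np.array_equal(k.active_dims, [0, 1, 2])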
- def merge_factors(self, X, Xs=None, diag=False):
+ def _merge_factors_cov(self, X, Xs=None, diag=False):
+ """Called to evaluate either all the sums or all the
+ products of kernels that are possible to evaluate.
+ """
factor_list = []
- for factor in self.factor_list:
+ for factor in self._factor_list:
# make sure diag=True is handled properly
- if isinstance(factor, Covariance):
+ if isinstance(factor, BaseCovariance):
factor_list.append(factor(X, Xs, diag))
+
elif isinstance(factor, np.ndarray):
if np.ndim(factor) == 2 and diag:
factor_list.append(np.diag(factor))
else:
factor_list.append(factor)
+
elif isinstance(
factor,
(
@@ -195,22 +243,78 @@ def merge_factors(self, X, Xs=None, diag=False):
),
):
if factor.ndim == 2 and diag:
- factor_list.append(at.diag(factor))
+ factor_list.append(pt.diag(factor))
else:
factor_list.append(factor)
+
+ else:
+ factor_list.append(factor)
+
+ return factor_list
+
+ def _merge_factors_psd(self, omega):
+ """Called to evaluatate spectral densities of combination kernels when possible.
+
+ Implements
+ a more restricted set of rules than `_merge_factors_cov` -- just additivity of stationary
+ covariances with defined power spectral densities and multiplication by scalars. Also, the
+ active_dims for all covariances in the sum must be the same.
+ """
+ factor_list = []
+ for factor in self._factor_list:
+ if isinstance(factor, Covariance):
+ # Allow merging covariances for psd only if active_dims are the same
+ if set(self.active_dims) != set(factor.active_dims):
+ raise ValueError(
+ "For power spectral density calculations `active_dims` must be the same "
+ "for all covariances in the sum."
+ )
+
+ # If it's a covariance try to calculate the psd
+ try:
+ factor_list.append(factor.power_spectral_density(omega))
+
+ except (AttributeError, NotImplementedError) as e:
+ if isinstance(factor, Stationary):
+ raise NotImplementedError(
+ f"No power spectral density method has been implemented for {factor}."
+ ) from e
+
+ else:
+ raise ValueError(
+ "Power spectral densities, `.power_spectral_density(omega)`, can only "
+ f"be calculated for `Stationary` covariance functions. {factor} is "
+ "non-stationary."
+ ) from e
+
else:
+ # Otherwise defer the reduction to later
factor_list.append(factor)
+
return factor_list
class Add(Combination):
def __call__(self, X, Xs=None, diag=False):
- return reduce(add, self.merge_factors(X, Xs, diag))
+ return reduce(add, self._merge_factors_cov(X, Xs, diag))
+
+ def power_spectral_density(self, omega):
+ return reduce(add, self._merge_factors_psd(omega))
class Prod(Combination):
def __call__(self, X, Xs=None, diag=False):
- return reduce(mul, self.merge_factors(X, Xs, diag))
+ return reduce(mul, self._merge_factors_cov(X, Xs, diag))
+
+ def power_spectral_density(self, omega):
+ check = Counter([isinstance(factor, Covariance) for factor in self._factor_list])
+ if check.get(True) >= 2:
+ raise NotImplementedError(
+ "The power spectral density of products of covariance "
+ "functions is not implemented."
+ )
+
+ return reduce(mul, self._merge_factors_psd(omega))
class Exponentiated(Covariance):
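A sketch of the resulting behaviour (kernels, lengthscales and frequencies are arbitrary): sums of stationary kernels, possibly scaled by scalars, expose a power spectral density, while products of two covariance functions do not.

.. code-block:: python

    import numpy as np
    import pymc as pm

    omega = np.linspace(0.1, 2, 10)[:, None]

    added = pm.gp.cov.ExpQuad(1, ls=0.3) + 2.0 * pm.gp.cov.Matern52(1, ls=1.0)
    psd = added.power_spectral_density(omega).eval()     # additivity plus scalar scaling

    multiplied = pm.gp.cov.ExpQuad(1, ls=0.3) * pm.gp.cov.Matern52(1, ls=1.0)
    try:
        multiplied.power_spectral_density(omega)
    except NotImplementedError:
        pass                                             # products are rejected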
@@ -243,7 +347,7 @@ def __init__(self, factor_list):
self.input_dims = [factor.input_dim for factor in factor_list]
input_dim = sum(self.input_dims)
super().__init__(input_dim=input_dim)
- self.factor_list = factor_list
+ self._factor_list = factor_list
def _split(self, X, Xs):
indices = np.cumsum(self.input_dims)
@@ -256,11 +360,11 @@ def _split(self, X, Xs):
def __call__(self, X, Xs=None, diag=False):
X_split, Xs_split = self._split(X, Xs)
- covs = [cov(x, xs, diag) for cov, x, xs in zip(self.factor_list, X_split, Xs_split)]
+ covs = [cov(x, xs, diag) for cov, x, xs in zip(self._factor_list, X_split, Xs_split)]
return reduce(mul, covs)
-class Constant(Covariance):
+class Constant(BaseCovariance):
r"""
Constant valued covariance function.
@@ -270,17 +374,16 @@ class Constant(Covariance):
"""
def __init__(self, c):
- super().__init__(1, None)
self.c = c
def diag(self, X):
- return at.alloc(self.c, X.shape[0])
+ return pt.alloc(self.c, X.shape[0])
def full(self, X, Xs=None):
if Xs is None:
- return at.alloc(self.c, X.shape[0], X.shape[0])
+ return pt.alloc(self.c, X.shape[0], X.shape[0])
else:
- return at.alloc(self.c, X.shape[0], Xs.shape[0])
+ return pt.alloc(self.c, X.shape[0], Xs.shape[0])
class WhiteNoise(Covariance):
@@ -293,17 +396,16 @@ class WhiteNoise(Covariance):
"""
def __init__(self, sigma):
- super().__init__(1, None)
self.sigma = sigma
def diag(self, X):
- return at.alloc(at.square(self.sigma), X.shape[0])
+ return pt.alloc(pt.square(self.sigma), X.shape[0])
def full(self, X, Xs=None):
if Xs is None:
- return at.diag(self.diag(X))
+ return pt.diag(self.diag(X))
else:
- return at.alloc(0.0, X.shape[0], Xs.shape[0])
+ return pt.alloc(0.0, X.shape[0], Xs.shape[0])
class Circular(Covariance):
@@ -340,25 +442,25 @@ class Circular(Covariance):
def __init__(self, input_dim, period, tau=4, active_dims=None):
super().__init__(input_dim, active_dims)
- self.c = at.as_tensor_variable(period / 2)
+ self.c = pt.as_tensor_variable(period / 2)
self.tau = tau
def dist(self, X, Xs):
if Xs is None:
- Xs = at.transpose(X)
+ Xs = pt.transpose(X)
else:
- Xs = at.transpose(Xs)
- return at.abs((X - Xs + self.c) % (self.c * 2) - self.c)
+ Xs = pt.transpose(Xs)
+ return pt.abs((X - Xs + self.c) % (self.c * 2) - self.c)
def weinland(self, t):
- return (1 + self.tau * t / self.c) * at.clip(1 - t / self.c, 0, np.inf) ** self.tau
+ return (1 + self.tau * t / self.c) * pt.clip(1 - t / self.c, 0, np.inf) ** self.tau
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
return self.weinland(self.dist(X, Xs))
def diag(self, X):
- return at.alloc(1.0, X.shape[0])
+ return pt.alloc(1.0, X.shape[0])
class Stationary(Covariance):
@@ -381,33 +483,36 @@ def __init__(self, input_dim, ls=None, ls_inv=None, active_dims=None):
ls = 1.0 / np.asarray(ls_inv)
else:
ls = 1.0 / ls_inv
- self.ls = at.as_tensor_variable(ls)
+ self.ls = pt.as_tensor_variable(ls)
def square_dist(self, X, Xs):
- X = at.mul(X, 1.0 / self.ls)
- X2 = at.sum(at.square(X), 1)
+ X = pt.mul(X, 1.0 / self.ls)
+ X2 = pt.sum(pt.square(X), 1)
if Xs is None:
- sqd = -2.0 * at.dot(X, at.transpose(X)) + (
- at.reshape(X2, (-1, 1)) + at.reshape(X2, (1, -1))
+ sqd = -2.0 * pt.dot(X, pt.transpose(X)) + (
+ pt.reshape(X2, (-1, 1)) + pt.reshape(X2, (1, -1))
)
else:
- Xs = at.mul(Xs, 1.0 / self.ls)
- Xs2 = at.sum(at.square(Xs), 1)
- sqd = -2.0 * at.dot(X, at.transpose(Xs)) + (
- at.reshape(X2, (-1, 1)) + at.reshape(Xs2, (1, -1))
+ Xs = pt.mul(Xs, 1.0 / self.ls)
+ Xs2 = pt.sum(pt.square(Xs), 1)
+ sqd = -2.0 * pt.dot(X, pt.transpose(Xs)) + (
+ pt.reshape(X2, (-1, 1)) + pt.reshape(Xs2, (1, -1))
)
- return at.clip(sqd, 0.0, np.inf)
+ return pt.clip(sqd, 0.0, np.inf)
def euclidean_dist(self, X, Xs):
r2 = self.square_dist(X, Xs)
- return at.sqrt(r2 + 1e-12)
+ return pt.sqrt(r2 + 1e-12)
def diag(self, X):
- return at.alloc(1.0, X.shape[0])
+ return pt.alloc(1.0, X.shape[0])
def full(self, X, Xs=None):
raise NotImplementedError
+ def power_spectral_density(self, omega):
+ raise NotImplementedError
+
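The ``square_dist`` expansion above is the usual identity ``|x - y|^2 = |x|^2 + |y|^2 - 2 x.y``; a NumPy check with the lengthscale scaling omitted (random data, arbitrary seed):

.. code-block:: python

    import numpy as np

    X = np.random.default_rng(5).normal(size=(4, 3))
    X2 = np.sum(X**2, axis=1)

    sqd = -2.0 * X @ X.T + (X2[:, None] + X2[None, :])        # expansion used above
    brute = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)    # direct pairwise distances
    assert np.allclose(np.clip(sqd, 0, np.inf), brute)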
class Periodic(Stationary):
r"""
@@ -439,8 +544,8 @@ def full(self, X, Xs=None):
f1 = X.dimshuffle(0, "x", 1)
f2 = Xs.dimshuffle("x", 0, 1)
r = np.pi * (f1 - f2) / self.period
- r = at.sum(at.square(at.sin(r) / self.ls), 2)
- return at.exp(-0.5 * r)
+ r = pt.sum(pt.square(pt.sin(r) / self.ls), 2)
+ return pt.exp(-0.5 * r)
class ExpQuad(Stationary):
@@ -451,11 +556,27 @@ class ExpQuad(Stationary):
.. math::
k(x, x') = \mathrm{exp}\left[ -\frac{(x - x')^2}{2 \ell^2} \right]
+
"""
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- return at.exp(-0.5 * self.square_dist(X, Xs))
+ return pt.exp(-0.5 * self.square_dist(X, Xs))
+
+ def power_spectral_density(self, omega):
+ r"""
+ The power spectral density for the ExpQuad kernel is:
+
+ .. math::
+
+ S(\boldsymbol\omega) =
+               (\sqrt{2 \pi})^D \prod_{i}^{D}\ell_i
+ \exp\left( -\frac{1}{2} \sum_{i}^{D}\ell_i^2 \omega_i^{2} \right)
+ """
+ ls = pt.ones(self.n_dims) * self.ls
+ c = pt.power(pt.sqrt(2.0 * np.pi), self.n_dims)
+ exp = pt.exp(-0.5 * pt.dot(pt.square(omega), pt.square(ls)))
+ return c * pt.prod(ls) * exp
class RatQuad(Stationary):
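A quick numerical check of the ExpQuad density above against its one-dimensional closed form (lengthscale and frequency grid are arbitrary):

.. code-block:: python

    import numpy as np
    import pymc as pm

    ell = 0.7
    omega = np.linspace(0.0, 3.0, 50)[:, None]

    psd = pm.gp.cov.ExpQuad(1, ls=ell).power_spectral_density(omega).eval()
    expected = np.sqrt(2 * np.pi) * ell * np.exp(-0.5 * ell**2 * omega.ravel() ** 2)
    assert np.allclose(psd, expected)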
@@ -473,7 +594,7 @@ def __init__(self, input_dim, alpha, ls=None, ls_inv=None, active_dims=None):
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- return at.power(
+ return pt.power(
(1.0 + 0.5 * self.square_dist(X, Xs) * (1.0 / self.alpha)),
-1.0 * self.alpha,
)
@@ -493,7 +614,31 @@ class Matern52(Stationary):
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
r = self.euclidean_dist(X, Xs)
- return (1.0 + np.sqrt(5.0) * r + 5.0 / 3.0 * at.square(r)) * at.exp(-1.0 * np.sqrt(5.0) * r)
+ return (1.0 + np.sqrt(5.0) * r + 5.0 / 3.0 * pt.square(r)) * pt.exp(-1.0 * np.sqrt(5.0) * r)
+
+ def power_spectral_density(self, omega):
+ r"""
+ The power spectral density for the Matern52 kernel is:
+
+ .. math::
+
+ S(\boldsymbol\omega) =
+ \frac{2^D \pi^{\frac{D}{2}} \Gamma(\frac{D+5}{2}) 5^{5/2}}
+ {\frac{3}{4}\sqrt{\pi}}
+ \prod_{i=1}^{D}\ell_{i}
+ \left(5 + \sum_{i=1}^{D}\ell_{i}^2 \boldsymbol\omega_{i}^{2}\right)^{-\frac{D+5}{2}}
+ """
+ ls = pt.ones(self.n_dims) * self.ls
+ D52 = (self.n_dims + 5) / 2
+ num = (
+ pt.power(2, self.n_dims)
+ * pt.power(np.pi, self.n_dims / 2)
+ * pt.gamma(D52)
+ * pt.power(5, 5 / 2)
+ )
+ den = 0.75 * pt.sqrt(np.pi)
+ pow = pt.power(5.0 + pt.dot(pt.square(omega), pt.square(ls)), -1 * D52)
+ return (num / den) * pt.prod(ls) * pow
class Matern32(Stationary):
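For multi-dimensional inputs the density above expects ``omega`` with one column per active dimension; a shape-only sketch (sizes and lengthscales are arbitrary):

.. code-block:: python

    import numpy as np
    import pymc as pm

    omega = np.random.default_rng(4).uniform(0.1, 2.0, size=(25, 2))
    psd = pm.gp.cov.Matern52(2, ls=[0.5, 1.5]).power_spectral_density(omega).eval()
    assert psd.shape == (25,)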
@@ -509,7 +654,31 @@ class Matern32(Stationary):
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
r = self.euclidean_dist(X, Xs)
- return (1.0 + np.sqrt(3.0) * r) * at.exp(-np.sqrt(3.0) * r)
+ return (1.0 + np.sqrt(3.0) * r) * pt.exp(-np.sqrt(3.0) * r)
+
+ def power_spectral_density(self, omega):
+ r"""
+ The power spectral density for the Matern32 kernel is:
+
+ .. math::
+
+ S(\boldsymbol\omega) =
+ \frac{2^D \pi^{D/2} \Gamma\left(\frac{D+3}{2}\right) 3^{3/2}}
+ {\frac{1}{2}\sqrt{\pi}}
+ \prod_{i=1}^{D}\ell_{i}
+ \left(3 + \sum_{i=1}^{D}\ell_{i}^2 \boldsymbol\omega_{i}^{2}\right)^{-\frac{D+3}{2}}
+ """
+ ls = pt.ones(self.n_dims) * self.ls
+ D32 = (self.n_dims + 3) / 2
+ num = (
+ pt.power(2, self.n_dims)
+ * pt.power(np.pi, self.n_dims / 2)
+ * pt.gamma(D32)
+ * pt.power(3, 3 / 2)
+ )
+ den = 0.5 * pt.sqrt(np.pi)
+ pow = pt.power(3.0 + pt.dot(pt.square(omega), pt.square(ls)), -1 * D32)
+ return (num / den) * pt.prod(ls) * pow
class Matern12(Stationary):
@@ -524,7 +693,7 @@ class Matern12(Stationary):
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
r = self.euclidean_dist(X, Xs)
- return at.exp(-r)
+ return pt.exp(-r)
class Exponential(Stationary):
@@ -538,7 +707,7 @@ class Exponential(Stationary):
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- return at.exp(-0.5 * self.euclidean_dist(X, Xs))
+ return pt.exp(-0.5 * self.euclidean_dist(X, Xs))
class Cosine(Stationary):
@@ -551,7 +720,7 @@ class Cosine(Stationary):
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- return at.cos(2.0 * np.pi * self.euclidean_dist(X, Xs))
+ return pt.cos(2.0 * np.pi * self.euclidean_dist(X, Xs))
class Linear(Covariance):
@@ -568,20 +737,20 @@ def __init__(self, input_dim, c, active_dims=None):
def _common(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- Xc = at.sub(X, self.c)
+ Xc = pt.sub(X, self.c)
return X, Xc, Xs
def full(self, X, Xs=None):
X, Xc, Xs = self._common(X, Xs)
if Xs is None:
- return at.dot(Xc, at.transpose(Xc))
+ return pt.dot(Xc, pt.transpose(Xc))
else:
- Xsc = at.sub(Xs, self.c)
- return at.dot(Xc, at.transpose(Xsc))
+ Xsc = pt.sub(Xs, self.c)
+ return pt.dot(Xc, pt.transpose(Xsc))
def diag(self, X):
X, Xc, _ = self._common(X, None)
- return at.sum(at.square(Xc), 1)
+ return pt.sum(pt.square(Xc), 1)
class Polynomial(Linear):
@@ -599,11 +768,11 @@ def __init__(self, input_dim, c, d, offset, active_dims=None):
def full(self, X, Xs=None):
linear = super().full(X, Xs)
- return at.power(linear + self.offset, self.d)
+ return pt.power(linear + self.offset, self.d)
def diag(self, X):
linear = super().diag(X)
- return at.power(linear + self.offset, self.d)
+ return pt.power(linear + self.offset, self.d)
class WarpedInput(Covariance):
@@ -677,33 +846,33 @@ def __init__(self, input_dim, lengthscale_func, args=None, active_dims=None):
self.args = args
def square_dist(self, X, Xs=None):
- X2 = at.sum(at.square(X), 1)
+ X2 = pt.sum(pt.square(X), 1)
if Xs is None:
- sqd = -2.0 * at.dot(X, at.transpose(X)) + (
- at.reshape(X2, (-1, 1)) + at.reshape(X2, (1, -1))
+ sqd = -2.0 * pt.dot(X, pt.transpose(X)) + (
+ pt.reshape(X2, (-1, 1)) + pt.reshape(X2, (1, -1))
)
else:
- Xs2 = at.sum(at.square(Xs), 1)
- sqd = -2.0 * at.dot(X, at.transpose(Xs)) + (
- at.reshape(X2, (-1, 1)) + at.reshape(Xs2, (1, -1))
+ Xs2 = pt.sum(pt.square(Xs), 1)
+ sqd = -2.0 * pt.dot(X, pt.transpose(Xs)) + (
+ pt.reshape(X2, (-1, 1)) + pt.reshape(Xs2, (1, -1))
)
- return at.clip(sqd, 0.0, np.inf)
+ return pt.clip(sqd, 0.0, np.inf)
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- rx = self.lfunc(at.as_tensor_variable(X), self.args)
+ rx = self.lfunc(pt.as_tensor_variable(X), self.args)
if Xs is None:
- rz = self.lfunc(at.as_tensor_variable(X), self.args)
+ rz = self.lfunc(pt.as_tensor_variable(X), self.args)
r2 = self.square_dist(X, X)
else:
- rz = self.lfunc(at.as_tensor_variable(Xs), self.args)
+ rz = self.lfunc(pt.as_tensor_variable(Xs), self.args)
r2 = self.square_dist(X, Xs)
- rx2 = at.reshape(at.square(rx), (-1, 1))
- rz2 = at.reshape(at.square(rz), (1, -1))
- return at.sqrt((2.0 * at.outer(rx, rz)) / (rx2 + rz2)) * at.exp(-1.0 * r2 / (rx2 + rz2))
+ rx2 = pt.reshape(pt.square(rx), (-1, 1))
+ rz2 = pt.reshape(pt.square(rz), (1, -1))
+ return pt.sqrt((2.0 * pt.outer(rx, rz)) / (rx2 + rz2)) * pt.exp(-1.0 * r2 / (rx2 + rz2))
def diag(self, X):
- return at.alloc(1.0, X.shape[0])
+ return pt.alloc(1.0, X.shape[0])
class ScaledCov(Covariance):
@@ -738,17 +907,17 @@ def __init__(self, input_dim, cov_func, scaling_func, args=None, active_dims=Non
def diag(self, X):
X, _ = self._slice(X, None)
cov_diag = self.cov_func(X, diag=True)
- scf_diag = at.square(at.flatten(self.scaling_func(X, self.args)))
+ scf_diag = pt.square(pt.flatten(self.scaling_func(X, self.args)))
return cov_diag * scf_diag
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
scf_x = self.scaling_func(X, self.args)
if Xs is None:
- return at.outer(scf_x, scf_x) * self.cov_func(X)
+ return pt.outer(scf_x, scf_x) * self.cov_func(X)
else:
scf_xs = self.scaling_func(Xs, self.args)
- return at.outer(scf_x, scf_xs) * self.cov_func(X, Xs)
+ return pt.outer(scf_x, scf_xs) * self.cov_func(X, Xs)
class Coregion(Covariance):
@@ -792,27 +961,27 @@ def __init__(self, input_dim, W=None, kappa=None, B=None, active_dims=None):
if make_B and B is not None:
raise ValueError("Exactly one of (W, kappa) and B must be provided to Coregion")
if make_B:
- self.W = at.as_tensor_variable(W)
- self.kappa = at.as_tensor_variable(kappa)
- self.B = at.dot(self.W, self.W.T) + at.diag(self.kappa)
+ self.W = pt.as_tensor_variable(W)
+ self.kappa = pt.as_tensor_variable(kappa)
+ self.B = pt.dot(self.W, self.W.T) + pt.diag(self.kappa)
elif B is not None:
- self.B = at.as_tensor_variable(B)
+ self.B = pt.as_tensor_variable(B)
else:
raise ValueError("Exactly one of (W, kappa) and B must be provided to Coregion")
def full(self, X, Xs=None):
X, Xs = self._slice(X, Xs)
- index = at.cast(X, "int32")
+ index = pt.cast(X, "int32")
if Xs is None:
index2 = index.T
else:
- index2 = at.cast(Xs, "int32").T
+ index2 = pt.cast(Xs, "int32").T
return self.B[index, index2]
def diag(self, X):
X, _ = self._slice(X, None)
- index = at.cast(X, "int32")
- return at.diag(self.B)[index.ravel()]
+ index = pt.cast(X, "int32")
+ return pt.diag(self.B)[index.ravel()]
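For readers skimming the rename, a minimal sketch of how the Coregion kernel touched here is used: `B = W Wᵀ + diag(kappa)` is indexed by an integer output-id column. The array values below are arbitrary.

.. code:: python

    import numpy as np
    import pymc as pm

    W = np.array([[0.5, 0.1], [0.3, -0.2], [0.0, 0.9]])  # 3 outputs, rank-2 structure
    kappa = np.ones(3)                                   # per-output diagonal term

    B_cov = pm.gp.cov.Coregion(input_dim=1, W=W, kappa=kappa)

    # X holds integer output indices; the kernel looks up entries of B.
    X = np.array([[0], [1], [2], [1]])
    print(B_cov(X).eval().shape)        # (4, 4)
    print(B_cov(X, diag=True).eval())   # the diagonal entries B[i, i]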
def handle_args(func, args):
diff --git a/pymc/gp/gp.py b/pymc/gp/gp.py
index e6bae18456..e8a695787a 100644
--- a/pymc/gp/gp.py
+++ b/pymc/gp/gp.py
@@ -15,7 +15,7 @@
import warnings
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.tensor.nlinalg import eigh
@@ -111,10 +111,10 @@ class Latent(Base):
Parameters
----------
- cov_func: None, 2D array, or instance of Covariance
- The covariance function. Defaults to zero.
- mean_func: None, instance of Mean
- The mean function. Defaults to zero.
+ mean_func : Mean, default ~pymc.gp.mean.Zero
+ The mean function.
+ cov_func : 2D array-like, or Covariance, default ~pymc.gp.cov.Constant
+ The covariance function.
Examples
--------
@@ -171,18 +171,20 @@ def prior(self, name, X, reparameterize=True, jitter=JITTER_DEFAULT, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- X: array-like
- Function input values.
- reparameterize: bool
+ X : array-like
+ Function input values. If one-dimensional, must be a column
+ vector with shape `(n, 1)`.
+ reparameterize : bool, default True
Reparameterize the distribution by rotating the random
variable by the Cholesky factor of the covariance matrix.
- jitter: scalar
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to distribution constructor.
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal`
+ distribution constructor.
"""
f = self._build_prior(name, X, reparameterize, jitter, **kwargs)
@@ -211,9 +213,9 @@ def _build_conditional(self, Xnew, X, f, cov_total, mean_total, jitter):
L = cholesky(stabilize(Kxx, jitter))
A = solve_lower(L, Kxs)
v = solve_lower(L, f - mean_total(X))
- mu = self.mean_func(Xnew) + at.dot(at.transpose(A), v)
+ mu = self.mean_func(Xnew) + pt.dot(pt.transpose(A), v)
Kss = self.cov_func(Xnew)
- cov = Kss - at.dot(at.transpose(A), A)
+ cov = Kss - pt.dot(pt.transpose(A), A)
return mu, cov
def conditional(self, name, Xnew, given=None, jitter=JITTER_DEFAULT, **kwargs):
@@ -233,19 +235,20 @@ def conditional(self, name, Xnew, given=None, jitter=JITTER_DEFAULT, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xnew: array-like
- Function input values.
- given: dict
- Can optionally take as key value pairs: `X`, `y`,
- and `gp`. See the section in the documentation on additive GP
- models in PyMC for more information.
- jitter: scalar
+ Xnew : array-like
+ Function input values. If one-dimensional, must be a column
+ vector with shape `(n, 1)`.
+ given : dict, optional
+ Can take as key value pairs: `X`, `y`,
+ and `gp`. See the :ref:`section ` in the documentation
+ on additive GP models in pymc for more information.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
givens = self._get_given_vals(given)
@@ -260,7 +263,7 @@ class TP(Latent):
The usage is nearly identical to that of `gp.Latent`. The differences
are that it must be initialized with a degrees of freedom parameter, and
- TP is not additive. Given a mean and covariance function, and a degrees of
+ TP is not additive. Given a mean and covariance function, and a degrees of
freedom parameter, the function :math:`f(x)` is modeled as,
.. math::
@@ -270,10 +273,12 @@ class TP(Latent):
Parameters
----------
- scale_func : None, 2D array, or instance of Covariance
- The scale function. Defaults to zero.
- mean_func : None, instance of Mean
- The mean function. Defaults to zero.
+ mean_func : Mean, default ~pymc.gp.mean.Zero
+ The mean function.
+ scale_func : 2D array-like, or Covariance, default ~pymc.gp.cov.Constant
+ The covariance function.
+ cov_func : 2D array-like, or Covariance, default None
+        Deprecated. Previous name of `scale_func`.
nu : float
The degrees of freedom
@@ -320,15 +325,20 @@ def prior(self, name, X, reparameterize=True, jitter=JITTER_DEFAULT, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- X: array-like
- Function input values.
- reparameterize: bool
+ X : array-like
+ Function input values. If one-dimensional, must be a column
+ vector with shape `(n, 1)`.
+ reparameterize : bool, default True
Reparameterize the distribution by rotating the random
variable by the Cholesky factor of the covariance matrix.
+ jitter : float, default 1e-6
+ A small correction added to the diagonal of positive semi-definite
+ covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to distribution constructor.
+ Extra keyword arguments that are passed to :class:`~pymc.MvStudentT`
+ distribution constructor.
"""
f = self._build_prior(name, X, reparameterize, jitter, **kwargs)
@@ -342,10 +352,10 @@ def _build_conditional(self, Xnew, X, f, jitter):
Kss = self.cov_func(Xnew)
L = cholesky(stabilize(Kxx, jitter))
A = solve_lower(L, Kxs)
- cov = Kss - at.dot(at.transpose(A), A)
+ cov = Kss - pt.dot(pt.transpose(A), A)
v = solve_lower(L, f - self.mean_func(X))
- mu = self.mean_func(Xnew) + at.dot(at.transpose(A), v)
- beta = at.dot(v, v)
+ mu = self.mean_func(Xnew) + pt.dot(pt.transpose(A), v)
+ beta = pt.dot(v, v)
nu2 = self.nu + X.shape[0]
covT = (self.nu + beta - 2) / (nu2 - 2) * cov
return nu2, mu, covT
@@ -361,15 +371,16 @@ def conditional(self, name, Xnew, jitter=JITTER_DEFAULT, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xnew: array-like
- Function input values.
- jitter: scalar
+ Xnew : array-like
+ Function input values. If one-dimensional, must be a column
+ vector with shape `(n, 1)`.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvStudentT` distribution
constructor.
"""
@@ -388,14 +399,15 @@ class Marginal(Base):
prior and additive noise. It has `marginal_likelihood`, `conditional`
and `predict` methods. This GP implementation can be used to
implement regression on data that is normally distributed. For more
- information on the `prior` and `conditional` methods, see their docstrings.
+ information on the `marginal_likelihood`, `conditional`
+ and `predict` methods, see their docstrings.
Parameters
----------
- cov_func: None, 2D array, or instance of Covariance
- The covariance function. Defaults to zero.
- mean_func: None, instance of Mean
- The mean function. Defaults to zero.
+ mean_func : Mean, default ~pymc.gp.mean.Zero
+ The mean function.
+ cov_func : 2D array-like, or Covariance, default ~pymc.gp.cov.Constant
+ The covariance function.
Examples
--------
@@ -439,7 +451,7 @@ def marginal_likelihood(
Returns the marginal likelihood distribution, given the input
locations `X` and the data `y`.
- This is integral over the product of the GP prior and a normal likelihood.
+ This is the integral over the product of the GP prior and a normal likelihood.
.. math::
@@ -447,24 +459,26 @@ def marginal_likelihood(
Parameters
----------
- name: string
+ name : str
Name of the random variable
- X: array-like
+ X : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- y: array-like
+ y : array-like
Data that is the sum of the function with the GP prior and Gaussian
noise. Must have shape `(n, )`.
- sigma: scalar, Variable, or Covariance
+ sigma : float, Variable, or Covariance, default ~pymc.gp.cov.WhiteNoise
Standard deviation of the Gaussian noise. Can also be a Covariance for
non-white noise.
- noise: scalar, Variable, or Covariance
- Previous parameterization of `sigma`.
- jitter: scalar
+ noise : float, Variable, or Covariance, optional
+ Deprecated. Previous parameterization of `sigma`.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
+ is_observed : bool, default True
+ Deprecated. Whether to set `y` as an `observed` variable in the `model`.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
sigma = _handle_sigma_noise_parameters(sigma=sigma, noise=noise)
@@ -516,16 +530,16 @@ def _build_conditional(
L = cholesky(stabilize(Kxx, jitter) + Knx)
A = solve_lower(L, Kxs)
v = solve_lower(L, rxx)
- mu = self.mean_func(Xnew) + at.dot(at.transpose(A), v)
+ mu = self.mean_func(Xnew) + pt.dot(pt.transpose(A), v)
if diag:
Kss = self.cov_func(Xnew, diag=True)
- var = Kss - at.sum(at.square(A), 0)
+ var = Kss - pt.sum(pt.square(A), 0)
if pred_noise:
var += noise_func(Xnew, diag=True)
return mu, var
else:
Kss = self.cov_func(Xnew)
- cov = Kss - at.dot(at.transpose(A), A)
+ cov = Kss - pt.dot(pt.transpose(A), A)
if pred_noise:
cov += noise_func(Xnew)
return mu, cov if pred_noise else stabilize(cov, jitter)
@@ -548,23 +562,22 @@ def conditional(
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- pred_noise: bool
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
- given: dict
- Can optionally take as key value pairs: `X`, `y`, `sigma`,
- and `gp`. See the section in the documentation on additive GP
- models in PyMC for more information.
- jitter: scalar
+ given : dict, optional
+ Can take key value pairs: `X`, `y`, `sigma`,
+ and `gp`. See the :ref:`section ` in the documentation
+ on additive GP models in pymc for more information.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
@@ -589,22 +602,27 @@ def predict(
Parameters
----------
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- point: pymc.model.Point
+ point : pymc.Point, optional
A specific point to condition on.
- diag: bool
+ diag : bool, default False
If `True`, return the diagonal instead of the full covariance
- matrix. Default is `False`.
- pred_noise: bool
+ matrix.
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
- given: dict
- Same as `conditional` method.
- jitter: scalar
+ given : dict, optional
+ Can take key value pairs: `X`, `y`, `sigma`,
+ and `gp`. See the :ref:`section ` in the documentation
+ on additive GP models in pymc for more information.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
+ model : Model, optional
+ Model with the Gaussian Process component for which predictions will
+ be generated. It is optional when inside a with context, otherwise
+ it is required.
"""
if given is None:
given = {}
@@ -618,17 +636,18 @@ def _predict_at(self, Xnew, diag=False, pred_noise=False, given=None, jitter=JIT
Parameters
----------
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- diag: bool
+ diag : bool, default False
If `True`, return the diagonal instead of the full covariance
- matrix. Default is `False`.
- pred_noise: bool
+ matrix.
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
- given: dict
- Same as `conditional` method.
+ given : dict, optional
+ Can take key value pairs: `X`, `y`, `sigma`,
+ and `gp`. See the :ref:`section ` in the documentation
+ on additive GP models in pymc for more information.
"""
givens = self._get_given_vals(given)
mu, cov = self._build_conditional(Xnew, pred_noise, diag, *givens, jitter)
@@ -652,13 +671,12 @@ class MarginalApprox(Marginal):
Parameters
----------
- cov_func: None, 2D array, or instance of Covariance
- The covariance function. Defaults to zero.
- mean_func: None, instance of Mean
- The mean function. Defaults to zero.
- approx: string
+ mean_func : Mean, default ~pymc.gp.mean.Zero
+ The mean function.
+ cov_func : 2D array-like, or Covariance, default ~pymc.gp.cov.Constant
+ The covariance function.
+ approx : str, default 'VFE'
The approximation to use. Must be one of `VFE`, `FITC` or `DTC`.
- Default is VFE.
Examples
--------
@@ -718,32 +736,32 @@ def __add__(self, other):
return new_gp
def _build_marginal_likelihood_loglik(self, y, X, Xu, sigma, jitter):
- sigma2 = at.square(sigma)
+ sigma2 = pt.square(sigma)
Kuu = self.cov_func(Xu)
Kuf = self.cov_func(Xu, X)
Luu = cholesky(stabilize(Kuu, jitter))
A = solve_lower(Luu, Kuf)
- Qffd = at.sum(A * A, 0)
+ Qffd = pt.sum(A * A, 0)
if self.approx == "FITC":
Kffd = self.cov_func(X, diag=True)
- Lamd = at.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
+ Lamd = pt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
trace = 0.0
elif self.approx == "VFE":
- Lamd = at.ones_like(Qffd) * sigma2
+ Lamd = pt.ones_like(Qffd) * sigma2
trace = (1.0 / (2.0 * sigma2)) * (
- at.sum(self.cov_func(X, diag=True)) - at.sum(at.sum(A * A, 0))
+ pt.sum(self.cov_func(X, diag=True)) - pt.sum(pt.sum(A * A, 0))
)
else: # DTC
- Lamd = at.ones_like(Qffd) * sigma2
+ Lamd = pt.ones_like(Qffd) * sigma2
trace = 0.0
A_l = A / Lamd
- L_B = cholesky(at.eye(Xu.shape[0]) + at.dot(A_l, at.transpose(A)))
+ L_B = cholesky(pt.eye(Xu.shape[0]) + pt.dot(A_l, pt.transpose(A)))
r = y - self.mean_func(X)
r_l = r / Lamd
- c = solve_lower(L_B, at.dot(A, r_l))
- constant = 0.5 * X.shape[0] * at.log(2.0 * np.pi)
- logdet = 0.5 * at.sum(at.log(Lamd)) + at.sum(at.log(at.diag(L_B)))
- quadratic = 0.5 * (at.dot(r, r_l) - at.dot(c, c))
+ c = solve_lower(L_B, pt.dot(A, r_l))
+ constant = 0.5 * X.shape[0] * pt.log(2.0 * np.pi)
+ logdet = 0.5 * pt.sum(pt.log(Lamd)) + pt.sum(pt.log(pt.diag(L_B)))
+ quadratic = 0.5 * (pt.dot(r, r_l) - pt.dot(c, c))
return -1.0 * (constant + logdet + quadratic + trace)
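As a usage reminder for the approximation branches above (FITC, VFE, DTC), here is a hedged sketch of fitting `MarginalApprox` with inducing points; the data, priors, and inducing-point locations are made up for illustration.

.. code:: python

    import numpy as np
    import pymc as pm

    rng = np.random.default_rng(0)
    X = np.linspace(0, 10, 100)[:, None]
    y = np.sin(X).ravel() + 0.1 * rng.standard_normal(100)
    Xu = np.linspace(0, 10, 15)[:, None]  # inducing point locations

    with pm.Model():
        ell = pm.Gamma("ell", alpha=2.0, beta=1.0)
        cov = pm.gp.cov.Matern52(1, ls=ell)
        gp = pm.gp.MarginalApprox(cov_func=cov, approx="VFE")
        y_ = gp.marginal_likelihood("y", X=X, Xu=Xu, y=y, sigma=0.1)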
def marginal_likelihood(
@@ -756,25 +774,25 @@ def marginal_likelihood(
Parameters
----------
- name: string
+ name : str
Name of the random variable
- X: array-like
+ X : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- Xu: array-like
+ Xu : array-like
The inducing points. Must have the same number of columns as `X`.
- y: array-like
+ y : array-like
Data that is the sum of the function with the GP prior and Gaussian
noise. Must have shape `(n, )`.
- sigma: scalar, Variable
+ sigma : float, Variable
Standard deviation of the Gaussian noise.
- noise: scalar, Variable
- Previous parameterization of `sigma`
- jitter: scalar
+ noise : float, Variable, optional
+ Previous parameterization of `sigma`.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
@@ -792,36 +810,36 @@ def marginal_likelihood(
def _build_conditional(
self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total, jitter
):
- sigma2 = at.square(sigma)
+ sigma2 = pt.square(sigma)
Kuu = cov_total(Xu)
Kuf = cov_total(Xu, X)
Luu = cholesky(stabilize(Kuu, jitter))
A = solve_lower(Luu, Kuf)
- Qffd = at.sum(A * A, 0)
+ Qffd = pt.sum(A * A, 0)
if self.approx == "FITC":
Kffd = cov_total(X, diag=True)
- Lamd = at.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
+ Lamd = pt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
else: # VFE or DTC
- Lamd = at.ones_like(Qffd) * sigma2
+ Lamd = pt.ones_like(Qffd) * sigma2
A_l = A / Lamd
- L_B = cholesky(at.eye(Xu.shape[0]) + at.dot(A_l, at.transpose(A)))
+ L_B = cholesky(pt.eye(Xu.shape[0]) + pt.dot(A_l, pt.transpose(A)))
r = y - mean_total(X)
r_l = r / Lamd
- c = solve_lower(L_B, at.dot(A, r_l))
+ c = solve_lower(L_B, pt.dot(A, r_l))
Kus = self.cov_func(Xu, Xnew)
As = solve_lower(Luu, Kus)
- mu = self.mean_func(Xnew) + at.dot(at.transpose(As), solve_upper(at.transpose(L_B), c))
+ mu = self.mean_func(Xnew) + pt.dot(pt.transpose(As), solve_upper(pt.transpose(L_B), c))
C = solve_lower(L_B, As)
if diag:
Kss = self.cov_func(Xnew, diag=True)
- var = Kss - at.sum(at.square(As), 0) + at.sum(at.square(C), 0)
+ var = Kss - pt.sum(pt.square(As), 0) + pt.sum(pt.square(C), 0)
if pred_noise:
var += sigma2
return mu, var
else:
- cov = self.cov_func(Xnew) - at.dot(at.transpose(As), As) + at.dot(at.transpose(C), C)
+ cov = self.cov_func(Xnew) - pt.dot(pt.transpose(As), As) + pt.dot(pt.transpose(C), C)
if pred_noise:
- cov += sigma2 * at.identity_like(cov)
+ cov += sigma2 * pt.identity_like(cov)
return mu, cov if pred_noise else stabilize(cov, jitter)
def _get_given_vals(self, given):
@@ -848,23 +866,22 @@ def conditional(
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- pred_noise: bool
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
- given: dict
- Can optionally take as key value pairs: `X`, `Xu`, `y`, `sigma`,
- and `gp`. See the section in the documentation on additive GP
- models in PyMC for more information.
- jitter: scalar
+ given : dict, optional
+ Can take key value pairs: `X`, `Xu`, `y`, `sigma`,
+ and `gp`. See the :ref:`section ` in the documentation
+ on additive GP models in pymc for more information.
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
@@ -892,20 +909,19 @@ class LatentKron(Base):
Kronecker structured covariance, without reference to any noise or
specific likelihood. The GP is constructed with the `prior` method,
and the conditional GP over new input locations is constructed with
- the `conditional` method. `conditional` and method. For more
+ the `conditional` method. For more
information on these methods, see their docstrings. This GP
implementation can be used to model a Gaussian process whose inputs
cover evenly spaced grids on more than one dimension. `LatentKron`
- is relies on the `KroneckerNormal` distribution, see its docstring
+ relies on the `KroneckerNormal` distribution, see its docstring
for more information.
Parameters
----------
- cov_funcs: list of Covariance objects
+ mean_func : Mean, default ~pymc.gp.mean.Zero
+ The mean function.
+ cov_funcs : list of Covariance, default [~pymc.gp.cov.Constant]
The covariance functions that compose the tensor (Kronecker) product.
- Defaults to [zero].
- mean_func: None, instance of Mean
- The mean function. Defaults to zero.
Examples
--------
@@ -953,7 +969,7 @@ def _build_prior(self, name, Xs, jitter, **kwargs):
mu = self.mean_func(cartesian(*Xs))
chols = [cholesky(stabilize(cov(X), jitter)) for cov, X in zip(self.cov_funcs, Xs)]
v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=self.N, **kwargs)
- f = pm.Deterministic(name, mu + at.flatten(kron_dot(chols, v)))
+ f = pm.Deterministic(name, mu + pt.flatten(kron_dot(chols, v)))
return f
def prior(self, name, Xs, jitter=JITTER_DEFAULT, **kwargs):
@@ -963,18 +979,18 @@ def prior(self, name, Xs, jitter=JITTER_DEFAULT, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xs: list of array-like
+ Xs : list of array-like
Function input values for each covariance function. Each entry
must be passable to its respective covariance without error. The
total covariance function is measured on the full grid
`cartesian(*Xs)`.
- jitter: scalar
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to the `KroneckerNormal`
+ Extra keyword arguments that are passed to the :class:`~pymc.KroneckerNormal`
distribution constructor.
"""
if len(Xs) != len(self.cov_funcs):
@@ -991,15 +1007,15 @@ def _build_conditional(self, Xnew, jitter):
delta = f - self.mean_func(X)
covs = [stabilize(cov(Xi), jitter) for cov, Xi in zip(self.cov_funcs, Xs)]
chols = [cholesky(cov) for cov in covs]
- cholTs = [at.transpose(chol) for chol in chols]
+ cholTs = [pt.transpose(chol) for chol in chols]
Kss = self.cov_func(Xnew)
Kxs = self.cov_func(X, Xnew)
- Ksx = at.transpose(Kxs)
+ Ksx = pt.transpose(Kxs)
alpha = kron_solve_lower(chols, delta)
alpha = kron_solve_upper(cholTs, alpha)
- mu = at.dot(Ksx, alpha).ravel() + self.mean_func(Xnew)
+ mu = pt.dot(Ksx, alpha).ravel() + self.mean_func(Xnew)
A = kron_solve_lower(chols, Kxs)
- cov = stabilize(Kss - at.dot(at.transpose(A), A), jitter)
+ cov = stabilize(Kss - pt.dot(pt.transpose(A), A), jitter)
return mu, cov
def conditional(self, name, Xnew, jitter=JITTER_DEFAULT, **kwargs):
@@ -1024,16 +1040,16 @@ def conditional(self, name, Xnew, jitter=JITTER_DEFAULT, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- jitter: scalar
+ jitter : float, default 1e-6
A small correction added to the diagonal of positive semi-definite
covariance matrices to ensure numerical stability.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
mu, cov = self._build_conditional(Xnew, jitter)
@@ -1053,15 +1069,15 @@ class MarginalKron(Base):
are measured on a full grid of inputs: `cartesian(*Xs)`.
`MarginalKron` is based on the `KroneckerNormal` distribution, see
its docstring for more information. For more information on the
- `prior` and `conditional` methods, see their docstrings.
+ `marginal_likelihood`, `conditional` and `predict` methods,
+ see their docstrings.
Parameters
----------
- cov_funcs: list of Covariance objects
+ mean_func : Mean, default ~pymc.gp.mean.Zero
+ The mean function.
+ cov_funcs : list of Covariance, default [~pymc.gp.cov.Constant]
The covariance functions that compose the tensor (Kronecker) product.
- Defaults to [zero].
- mean_func: None, instance of Mean
- The mean function. Defaults to zero.
Examples
--------
@@ -1131,23 +1147,22 @@ def marginal_likelihood(self, name, Xs, y, sigma, is_observed=True, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xs: list of array-like
+ Xs : list of array-like
Function input values for each covariance function. Each entry
must be passable to its respective covariance without error. The
total covariance function is measured on the full grid
`cartesian(*Xs)`.
- y: array-like
+ y : array-like
Data that is the sum of the function with the GP prior and Gaussian
noise. Must have shape `(n, )`.
- sigma: scalar, Variable
+ sigma : float, Variable
Standard deviation of the white Gaussian noise.
- is_observed: bool
- Whether to set `y` as an `observed` variable in the `model`.
- Default is `True`.
+ is_observed : bool, default True
+ Deprecated. Whether to set `y` as an `observed` variable in the `model`.
**kwargs
- Extra keyword arguments that are passed to `KroneckerNormal`
+ Extra keyword arguments that are passed to :class:`~pymc.KroneckerNormal`
distribution constructor.
"""
self._check_inputs(Xs, y)
@@ -1174,7 +1189,7 @@ def _build_conditional(self, Xnew, diag, pred_noise):
delta = y - self.mean_func(X)
Kns = [f(x) for f, x in zip(self.cov_funcs, Xs)]
eigs_sep, Qs = zip(*map(eigh, Kns)) # Unzip
- QTs = list(map(at.transpose, Qs))
+ QTs = list(map(pt.transpose, Qs))
eigs = kron_diag(*eigs_sep) # Combine separate eigs
if sigma is not None:
eigs += sigma**2
@@ -1187,21 +1202,21 @@ def _build_conditional(self, Xnew, diag, pred_noise):
alpha = kron_dot(QTs, delta)
alpha = alpha / eigs[:, None]
alpha = kron_dot(Qs, alpha)
- mu = at.dot(Kmn, alpha).ravel() + self.mean_func(Xnew)
+ mu = pt.dot(Kmn, alpha).ravel() + self.mean_func(Xnew)
# Build conditional cov
A = kron_dot(QTs, Knm)
- A = A / at.sqrt(eigs[:, None])
+ A = A / pt.sqrt(eigs[:, None])
if diag:
- Asq = at.sum(at.square(A), 0)
+ Asq = pt.sum(pt.square(A), 0)
cov = Km - Asq
if pred_noise:
cov += sigma
else:
- Asq = at.dot(A.T, A)
+ Asq = pt.dot(A.T, A)
cov = Km - Asq
if pred_noise:
- cov += sigma * at.identity_like(cov)
+ cov += sigma * pt.identity_like(cov)
return mu, cov
def conditional(self, name, Xnew, pred_noise=False, diag=False, **kwargs):
@@ -1226,16 +1241,15 @@ def conditional(self, name, Xnew, pred_noise=False, diag=False, **kwargs):
Parameters
----------
- name: string
+ name : str
Name of the random variable
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- pred_noise: bool
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
**kwargs
- Extra keyword arguments that are passed to `MvNormal` distribution
+ Extra keyword arguments that are passed to :class:`~pymc.MvNormal` distribution
constructor.
"""
mu, cov = self._build_conditional(Xnew, diag, pred_noise)
@@ -1249,17 +1263,20 @@ def predict(self, Xnew, point=None, diag=False, pred_noise=False, model=None):
Parameters
----------
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- point: pymc.model.Point
+ point : pymc.Point, optional
A specific point to condition on.
- diag: bool
+ diag : bool, default False
If `True`, return the diagonal instead of the full covariance
- matrix. Default is `False`.
- pred_noise: bool
+ matrix.
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
+ model : Model, optional
+ Model with the Gaussian Process component for which predictions will
+ be generated. It is optional when inside a with context, otherwise
+ it is required.
"""
mu, cov = self._predict_at(Xnew, diag, pred_noise)
return replace_with_values([mu, cov], replacements=point, model=model)
@@ -1271,15 +1288,14 @@ def _predict_at(self, Xnew, diag=False, pred_noise=False):
Parameters
----------
- Xnew: array-like
+ Xnew : array-like
Function input values. If one-dimensional, must be a column
vector with shape `(n, 1)`.
- diag: bool
+ diag : bool, default False
If `True`, return the diagonal instead of the full covariance
- matrix. Default is `False`.
- pred_noise: bool
+ matrix.
+ pred_noise : bool, default False
Whether or not observation noise is included in the conditional.
- Default is `False`.
"""
mu, cov = self._build_conditional(Xnew, diag, pred_noise)
return mu, cov
diff --git a/pymc/gp/hsgp_approx.py b/pymc/gp/hsgp_approx.py
new file mode 100644
index 0000000000..adca16f600
--- /dev/null
+++ b/pymc/gp/hsgp_approx.py
@@ -0,0 +1,374 @@
+# Copyright 2023 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numbers
+import warnings
+
+from types import ModuleType
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import pytensor.tensor as pt
+
+import pymc as pm
+
+from pymc.gp.cov import Covariance
+from pymc.gp.gp import Base
+from pymc.gp.mean import Mean, Zero
+
+TensorLike = Union[np.ndarray, pt.TensorVariable]
+
+
+def set_boundary(Xs: TensorLike, c: Union[numbers.Real, TensorLike]) -> TensorLike:
+ """Set the boundary using the mean-subtracted `Xs` and `c`. `c` is usually a scalar
+    multiplier greater than 1.0, but it may be one value per dimension or column of `Xs`.
+ """
+ S = pt.max(pt.abs(Xs), axis=0)
+ L = c * S
+ return L
+
+
+def calc_eigenvalues(L: TensorLike, m: Sequence[int], tl: ModuleType = np):
+ """Calculate eigenvalues of the Laplacian."""
+ S = np.meshgrid(*[np.arange(1, 1 + m[d]) for d in range(len(m))])
+ S_arr = np.vstack([s.flatten() for s in S]).T
+ return tl.square((np.pi * S_arr) / (2 * L))
+
+
+def calc_eigenvectors(
+ Xs: TensorLike,
+ L: TensorLike,
+ eigvals: TensorLike,
+ m: Sequence[int],
+ tl: ModuleType = np,
+):
+ """Calculate eigenvectors of the Laplacian. These are used as basis vectors in the HSGP
+ approximation.
+ """
+ m_star = int(np.prod(m))
+ phi = tl.ones((Xs.shape[0], m_star))
+ for d in range(len(m)):
+ c = 1.0 / tl.sqrt(L[d])
+ term1 = tl.sqrt(eigvals[:, d])
+ term2 = tl.tile(Xs[:, d][:, None], m_star) + L[d]
+ phi *= c * tl.sin(term1 * term2)
+ return phi
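These helpers define the fixed basis of the approximation. Below is a numpy-only sketch of how they compose, assuming the module is importable as `pymc.gp.hsgp_approx`; the toy inputs and `m` are arbitrary.

.. code:: python

    import numpy as np
    from pymc.gp.hsgp_approx import calc_eigenvalues, calc_eigenvectors

    Xs = np.linspace(-1.0, 1.0, 5)[:, None]  # already mean-centered inputs
    m = [4]                                  # four basis vectors, one active dimension
    c = 1.5
    L = c * np.max(np.abs(Xs), axis=0)       # what set_boundary computes, in numpy

    eigvals = calc_eigenvalues(L, m, tl=np)            # shape (4, 1)
    phi = calc_eigenvectors(Xs, L, eigvals, m, tl=np)  # shape (5, 4)
    print(phi.shape)                                   # one column per basis vector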
+
+
+class HSGP(Base):
+ R"""
+ Hilbert Space Gaussian process approximation.
+
+ The `gp.HSGP` class is an implementation of the Hilbert Space Gaussian process. It is a
+ reduced rank GP approximation that uses a fixed set of basis vectors whose coefficients are
+    random functions of a stationary covariance function's power spectral density. Its usage
+    is largely similar to `gp.Latent`. Like `gp.Latent`, it does not assume a Gaussian noise model
+ and can be used with any likelihood, or as a component anywhere within a model. Also like
+ `gp.Latent`, it has `prior` and `conditional` methods. It supports any sum of covariance
+ functions that implement a `power_spectral_density` method.
+
+    For information on choosing appropriate `m`, `L`, and `c`, refer to Ruitort-Mayol et al. or to
+ the PyMC examples that use HSGP.
+
+    To work with the HSGP in its "linearized" form, as a matrix of basis vectors and a vector of
+ coefficients, see the method `prior_linearized`.
+
+ Parameters
+ ----------
+ m: list
+ The number of basis vectors to use for each active dimension (covariance parameter
+ `active_dim`).
+ L: list
+ The boundary of the space for each `active_dim`. It is called the boundary condition.
+ Choose L such that the domain `[-L, L]` contains all points in the column of X given by the
+ `active_dim`.
+ c: float
+        The proportion extension factor. Used to construct L from X. Defined as `S = max|X|` such
+        that `X` is in `[-S, S]`. `L` is then calculated as `c * S`. One of `c` or `L` must be
+        provided. Further information can be found in Ruitort-Mayol et al.
+ drop_first: bool
+ Default `False`. Sometimes the first basis vector is quite "flat" and very similar to
+ the intercept term. When there is an intercept in the model, ignoring the first basis
+ vector may improve sampling.
+    cov_func : Covariance
+        The covariance function. Must implement a `power_spectral_density` method.
+    mean_func : Mean, default ~pymc.gp.mean.Zero
+        The mean function.
+
+ Examples
+ --------
+ .. code:: python
+
+ # A three dimensional column vector of inputs.
+ X = np.random.rand(100, 3)
+
+ with pm.Model() as model:
+ # Specify the covariance function.
+ # Three input dimensions, but we only want to use the last two.
+ cov_func = pm.gp.cov.ExpQuad(3, ls=0.1, active_dims=[1, 2])
+
+ # Specify the HSGP.
+ # Use 25 basis vectors across each active dimension for a total of 25 * 25 = 625.
+ # The value `c = 4` means the boundary of the approximation
+ # lies at four times the half width of the data.
+ # In this example the data lie between zero and one,
+            # so the boundaries occur at -1.5 and 2.5. The data, both for
+            # training and prediction, should reside well within that boundary.
+ gp = pm.gp.HSGP(m=[25, 25], c=4.0, cov_func=cov_func)
+
+ # Place a GP prior over the function f.
+ f = gp.prior("f", X=X)
+
+ ...
+
+ # After fitting or sampling, specify the distribution
+ # at new points with .conditional
+ Xnew = np.linspace(-1, 2, 50)[:, None]
+
+ with model:
+ fcond = gp.conditional("fcond", Xnew=Xnew)
+
+ References
+ ----------
+ - Ruitort-Mayol, G., and Anderson, M., and Solin, A., and Vehtari, A. (2022). Practical
+ Hilbert Space Approximate Bayesian Gaussian Processes for Probabilistic Programming
+
+ - Solin, A., Sarkka, S. (2019) Hilbert Space Methods for Reduced-Rank Gaussian Process
+ Regression.
+ """
+
+ def __init__(
+ self,
+ m: Sequence[int],
+ L: Optional[Sequence[float]] = None,
+ c: Optional[numbers.Real] = None,
+ drop_first: bool = False,
+ parameterization="noncentered",
+ *,
+ mean_func: Mean = Zero(),
+ cov_func: Covariance,
+ ):
+ arg_err_msg = (
+ "`m` and L, if provided, must be sequences with one element per active "
+ "dimension of the kernel or covariance function."
+ )
+
+ if not isinstance(m, Sequence):
+ raise ValueError(arg_err_msg)
+
+ if len(m) != cov_func.n_dims:
+ raise ValueError(arg_err_msg)
+ m = tuple(m)
+
+ if (L is None and c is None) or (L is not None and c is not None):
+ raise ValueError("Provide one of `c` or `L`")
+
+ if L is not None and (not isinstance(L, Sequence) or len(L) != cov_func.n_dims):
+ raise ValueError(arg_err_msg)
+
+ if L is None and c is not None and c < 1.2:
+ warnings.warn("For an adequate approximation `c >= 1.2` is recommended.")
+
+ parameterization = parameterization.lower().replace("-", "").replace("_", "")
+ if parameterization not in ["centered", "noncentered"]:
+ raise ValueError("`parameterization` must be either 'centered' or 'noncentered'.")
+ else:
+ self._parameterization = parameterization
+
+ self._drop_first = drop_first
+ self._m = m
+ self._m_star = int(np.prod(self._m))
+ self._L = L
+ self._c = c
+
+ super().__init__(mean_func=mean_func, cov_func=cov_func)
+
+ def __add__(self, other):
+ raise NotImplementedError("Additive HSGPs aren't supported.")
+
+ @property
+ def L(self):
+ if self._L is None:
+ raise RuntimeError("Boundaries `L` required but still unset.")
+ return self._L
+
+ @L.setter
+ def L(self, value):
+ self._L = pt.as_tensor_variable(value)
+
+ def prior_linearized(self, Xs: TensorLike):
+ """Linearized version of the HSGP. Returns the Laplace eigenfunctions and the square root
+ of the power spectral density needed to create the GP.
+
+        This function allows the user to bypass the GP interface and work directly with the basis
+        and coefficients. This format allows the user to create predictions using
+        `pm.set_data` similarly to a linear model. It also enables computational speed-ups in
+        multi-GP models, since they may share the same basis. The return values are the Laplace
+ eigenfunctions `phi`, and the square root of the power spectral density.
+
+ Correct results when using `prior_linearized` in tandem with `pm.set_data` and
+ `pm.MutableData` require two conditions. First, one must specify `L` instead of `c` when
+ the GP is constructed. If not, a RuntimeError is raised. Second, the `Xs` needs to be
+        zero-centered, so its mean must be subtracted. An example is given below.
+
+ Parameters
+ ----------
+ Xs: array-like
+ Function input values. Assumes they have been mean subtracted or centered at zero.
+
+ Returns
+ -------
+ phi: array-like
+ Either Numpy or PyTensor 2D array of the fixed basis vectors. There are n rows, one
+            per row of `Xs`, and `prod(m)` columns, one for each basis vector.
+ sqrt_psd: array-like
+ Either a Numpy or PyTensor 1D array of the square roots of the power spectral
+ densities.
+
+ Examples
+ --------
+ .. code:: python
+
+ # A one dimensional column vector of inputs.
+ X = np.linspace(0, 10, 100)[:, None]
+
+ with pm.Model() as model:
+ eta = pm.Exponential("eta", lam=1.0)
+ ell = pm.InverseGamma("ell", mu=5.0, sigma=5.0)
+ cov_func = eta**2 * pm.gp.cov.ExpQuad(1, ls=ell)
+
+                # m = [200] means 200 basis vectors for the first dimension
+ # L = [10] means the approximation is valid from Xs = [-10, 10]
+ gp = pm.gp.HSGP(m=[200], L=[10], cov_func=cov_func)
+
+ # Order is important. First calculate the mean, then make X a shared variable,
+ # then subtract the mean. When X is mutated later, the correct mean will be
+ # subtracted.
+ X_mean = np.mean(X, axis=0)
+ X = pm.MutableData("X", X)
+ Xs = X - X_mean
+
+                # Pass the mean-subtracted Xs into the GP
+ phi, sqrt_psd = gp.prior_linearized(Xs=Xs)
+
+                # Specify a standard normal prior on the coefficients. Their number is given
+                # by the number of basis vectors, which is also stored in the GP object
+                # as _m_star.
+                beta = pm.Normal("beta", size=gp._m_star)
+
+ # The (non-centered) GP approximation is given by
+ f = pm.Deterministic("f", phi @ (beta * sqrt_psd))
+
+ ...
+
+
+ # Then it works just like a linear regression to predict on new data.
+ # First mutate the data X,
+ x_new = np.linspace(-10, 10, 100)
+ with model:
+ model.set_data("X", x_new[:, None])
+
+ # and then make predictions for the GP using posterior predictive sampling.
+ with model:
+ ppc = pm.sample_posterior_predictive(idata, var_names=["f"])
+ """
+
+ # Index Xs using input_dim and active_dims of covariance function
+ Xs, _ = self.cov_func._slice(Xs)
+
+ # If not provided, use Xs and c to set L
+ if self._L is None:
+ assert isinstance(self._c, (numbers.Real, np.ndarray, pt.TensorVariable))
+ self.L = set_boundary(Xs, self._c)
+ else:
+ self.L = self._L
+
+ eigvals = calc_eigenvalues(self.L, self._m, tl=pt)
+ phi = calc_eigenvectors(Xs, self.L, eigvals, self._m, tl=pt)
+ omega = pt.sqrt(eigvals)
+ psd = self.cov_func.power_spectral_density(omega)
+
+        i = int(self._drop_first)
+ return phi[:, i:], pt.sqrt(psd[i:])
+
+ def prior(self, name: str, X: TensorLike, dims: Optional[str] = None): # type: ignore
+ R"""
+ Returns the (approximate) GP prior distribution evaluated over the input locations `X`.
+ For usage examples, refer to `pm.gp.Latent`.
+
+ Parameters
+ ----------
+        name : str
+            Name of the random variable
+        X : array-like
+            Function input values.
+        dims : str, optional
+            Dimension name for the GP random variable.
+ """
+ self._X_mean = pt.mean(X, axis=0)
+ phi, sqrt_psd = self.prior_linearized(X - self._X_mean)
+
+ if self._parameterization == "noncentered":
+ self._beta = pm.Normal(
+ f"{name}_hsgp_coeffs_", size=self._m_star - int(self._drop_first)
+ )
+ self._sqrt_psd = sqrt_psd
+ f = self.mean_func(X) + phi @ (self._beta * self._sqrt_psd)
+
+ elif self._parameterization == "centered":
+ self._beta = pm.Normal(f"{name}_hsgp_coeffs_", sigma=sqrt_psd)
+ f = self.mean_func(X) + phi @ self._beta
+
+ self.f = pm.Deterministic(name, f, dims=dims)
+ return self.f
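Both branches produce equivalent priors; the centered form folds the square-root power spectral density into the scale of the coefficients. A brief, hedged sketch of opting into it (the kernel, `m`, and `c` values are arbitrary, and `ExpQuad` is assumed to expose a `power_spectral_density` as in the class docstring example):

.. code:: python

    import numpy as np
    import pymc as pm

    X = np.linspace(0.0, 1.0, 20)[:, None]

    with pm.Model():
        cov_func = pm.gp.cov.ExpQuad(1, ls=0.2)
        gp = pm.gp.HSGP(m=[50], c=1.5, parameterization="centered", cov_func=cov_func)
        f = gp.prior("f", X=X)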
+
+ def _build_conditional(self, Xnew):
+ try:
+ beta, X_mean = self._beta, self._X_mean
+
+ if self._parameterization == "noncentered":
+ sqrt_psd = self._sqrt_psd
+
+ except AttributeError:
+ raise ValueError(
+ "Prior is not set, can't create a conditional. Call `.prior(name, X)` first."
+ )
+
+ Xnew, _ = self.cov_func._slice(Xnew)
+ eigvals = calc_eigenvalues(self.L, self._m, tl=pt)
+ phi = calc_eigenvectors(Xnew - X_mean, self.L, eigvals, self._m, tl=pt)
+        i = int(self._drop_first)
+
+ if self._parameterization == "noncentered":
+ return self.mean_func(Xnew) + phi[:, i:] @ (beta * sqrt_psd)
+
+ elif self._parameterization == "centered":
+ return self.mean_func(Xnew) + phi[:, i:] @ beta
+
+ def conditional(self, name: str, Xnew: TensorLike, dims: Optional[str] = None): # type: ignore
+ R"""
+ Returns the (approximate) conditional distribution evaluated over new input locations
+ `Xnew`.
+
+ Parameters
+ ----------
+        name : str
+            Name of the random variable
+        Xnew : array-like
+            Function input values.
+        dims : str, optional
+            Dimension name for the GP random variable.
+ """
+ fnew = self._build_conditional(Xnew)
+ return pm.Deterministic(name, fnew, dims=dims)
diff --git a/pymc/gp/mean.py b/pymc/gp/mean.py
index ccde742e99..b1146ca22f 100644
--- a/pymc/gp/mean.py
+++ b/pymc/gp/mean.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import pytensor.tensor as at
+import pytensor.tensor as pt
__all__ = ["Zero", "Constant", "Linear"]
@@ -46,7 +46,7 @@ class Zero(Mean):
"""
def __call__(self, X):
- return at.alloc(0.0, X.shape[0])
+ return pt.alloc(0.0, X.shape[0])
class Constant(Mean):
@@ -64,7 +64,7 @@ def __init__(self, c=0):
self.c = c
def __call__(self, X):
- return at.alloc(1.0, X.shape[0]) * self.c
+ return pt.alloc(1.0, X.shape[0]) * self.c
class Linear(Mean):
@@ -85,7 +85,7 @@ def __init__(self, coeffs, intercept=0):
self.A = coeffs
def __call__(self, X):
- return at.squeeze(at.dot(X, self.A) + self.b)
+ return pt.squeeze(pt.dot(X, self.A) + self.b)
class Add(Mean):
@@ -95,7 +95,7 @@ def __init__(self, first_mean, second_mean):
self.m2 = second_mean
def __call__(self, X):
- return at.add(self.m1(X), self.m2(X))
+ return pt.add(self.m1(X), self.m2(X))
class Prod(Mean):
@@ -105,4 +105,4 @@ def __init__(self, first_mean, second_mean):
self.m2 = second_mean
def __call__(self, X):
- return at.mul(self.m1(X), self.m2(X))
+ return pt.mul(self.m1(X), self.m2(X))
diff --git a/pymc/gp/util.py b/pymc/gp/util.py
index f2ae803895..7a418f7a6b 100644
--- a/pymc/gp/util.py
+++ b/pymc/gp/util.py
@@ -15,7 +15,7 @@
import warnings
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.compile import SharedVariable
from pytensor.tensor.slinalg import ( # noqa: W0611; pylint: disable=unused-import
@@ -100,7 +100,7 @@ def stabilize(K, jitter=JITTER_DEFAULT):
jitter: float
A small constant.
"""
- return K + jitter * at.identity_like(K)
+ return K + jitter * pt.identity_like(K)
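A tiny sketch of why this helper exists: the jittered diagonal lets the Cholesky of a merely positive semi-definite kernel matrix go through. The matrix below is a deliberately rank-deficient toy example.

.. code:: python

    import numpy as np
    import pytensor.tensor as pt
    from pytensor.tensor.slinalg import cholesky
    from pymc.gp.util import stabilize

    K = pt.as_tensor(np.ones((3, 3)))         # rank one: positive semi-definite only
    L = cholesky(stabilize(K, jitter=1e-6))   # succeeds thanks to the jittered diagonal
    print(L.eval())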
def kmeans_inducing_points(n_inducing, X, **kmeans_kwargs):
diff --git a/pymc/initial_point.py b/pymc/initial_point.py
index 8c914539a9..dfdca37c61 100644
--- a/pymc/initial_point.py
+++ b/pymc/initial_point.py
@@ -18,7 +18,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Variable
from pytensor.graph.fg import FunctionGraph
@@ -243,7 +243,7 @@ def make_initial_point_expression(
f'Invalid string strategy: {strategy}. It must be one of ["moment", "prior"]'
)
else:
- value = at.as_tensor(strategy, dtype=variable.dtype).astype(variable.dtype)
+ value = pt.as_tensor(strategy, dtype=variable.dtype).astype(variable.dtype)
transform = rvs_to_transforms.get(variable, None)
@@ -251,7 +251,7 @@ def make_initial_point_expression(
value = transform.forward(value, *variable.owner.inputs)
if variable in jitter_rvs:
- jitter = at.random.uniform(-1, 1, size=value.shape)
+ jitter = pt.random.uniform(-1, 1, size=value.shape)
jitter.name = f"{variable.name}_jitter"
value = value + jitter
diff --git a/pymc/logprob/__init__.py b/pymc/logprob/__init__.py
index 06b0c78edd..f6ae51408c 100644
--- a/pymc/logprob/__init__.py
+++ b/pymc/logprob/__init__.py
@@ -34,14 +34,13 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-from pymc.logprob.abstract import logprob, logcdf # isort: split
-
-from pymc.logprob.joint_logprob import factorized_joint_logprob, joint_logp, logp
+from pymc.logprob.basic import factorized_joint_logprob, icdf, joint_logp, logcdf, logp
# isort: off
# Add rewrites to the DBs
import pymc.logprob.censoring
import pymc.logprob.cumsum
+import pymc.logprob.checks
import pymc.logprob.mixture
import pymc.logprob.scan
import pymc.logprob.tensor
@@ -49,4 +48,8 @@
# isort: on
-__all__ = ("logp", "logcdf")
+__all__ = (
+ "logp",
+ "logcdf",
+ "icdf",
+)
diff --git a/pymc/logprob/abstract.py b/pymc/logprob/abstract.py
index ff308773b8..52ba5149d5 100644
--- a/pymc/logprob/abstract.py
+++ b/pymc/logprob/abstract.py
@@ -48,35 +48,6 @@
from pytensor.tensor.random.op import RandomVariable
-def logprob(rv_var, *rv_values, **kwargs):
- """Create a graph for the log-probability of a ``RandomVariable``."""
- logprob = _logprob(rv_var.owner.op, rv_values, *rv_var.owner.inputs, **kwargs)
-
- for rv_var in rv_values:
- if rv_var.name:
- logprob.name = f"{rv_var.name}_logprob"
-
- return logprob
-
-
-def logcdf(rv_var, rv_value, **kwargs):
- """Create a graph for the logcdf of a ``RandomVariable``."""
- logcdf = _logcdf(rv_var.owner.op, rv_value, *rv_var.owner.inputs, name=rv_var.name, **kwargs)
-
- if rv_var.name:
- logcdf.name = f"{rv_var.name}_logcdf"
-
- return logcdf
-
-
-def icdf(rv, value, **kwargs):
- """Create a graph for the inverse CDF of a `RandomVariable`."""
- rv_icdf = _icdf(rv.owner.op, value, *rv.owner.inputs, **kwargs)
- if rv.name:
- rv_icdf.name = f"{rv.name}_icdf"
- return rv_icdf
-
-
@singledispatch
def _logprob(
op: Op,
@@ -94,6 +65,18 @@ def _logprob(
raise NotImplementedError(f"Logprob method not implemented for {op}")
+def _logprob_helper(rv, *values, **kwargs):
+ """Helper that calls `_logprob` dispatcher."""
+ logprob = _logprob(rv.owner.op, values, *rv.owner.inputs, **kwargs)
+
+ for rv in values:
+ if rv.name:
+ logprob.name = f"{rv.name}_logprob"
+ break
+
+ return logprob
+
+
@singledispatch
def _logcdf(
op: Op,
@@ -107,7 +90,17 @@ def _logcdf(
of ``RandomVariable``. If you want to implement new logcdf graphs
for a ``RandomVariable``, register a new function on this dispatcher.
"""
- raise NotImplementedError(f"Logcdf method not implemented for {op}")
+ raise NotImplementedError(f"LogCDF method not implemented for {op}")
+
+
+def _logcdf_helper(rv, value, **kwargs):
+ """Helper that calls `_logcdf` dispatcher."""
+ logcdf = _logcdf(rv.owner.op, value, *rv.owner.inputs, name=rv.name, **kwargs)
+
+ if rv.name:
+ logcdf.name = f"{rv.name}_logcdf"
+
+ return logcdf
@singledispatch
@@ -122,7 +115,17 @@ def _icdf(
This function dispatches on the type of `op`, which should be a subclass
of `RandomVariable`.
"""
- raise NotImplementedError(f"icdf not implemented for {op}")
+ raise NotImplementedError(f"Inverse CDF method not implemented for {op}")
+
+
+def _icdf_helper(rv, value, **kwargs):
+ """Helper that calls `_icdf` dispatcher."""
+ rv_icdf = _icdf(rv.owner.op, value, *rv.owner.inputs, **kwargs)
+
+ if rv.name:
+ rv_icdf.name = f"{rv.name}_icdf"
+
+ return rv_icdf
class MeasurableVariable(abc.ABC):
diff --git a/pymc/logprob/joint_logprob.py b/pymc/logprob/basic.py
similarity index 75%
rename from pymc/logprob/joint_logprob.py
rename to pymc/logprob/basic.py
index d5548aa930..a8d4221f06 100644
--- a/pymc/logprob/joint_logprob.py
+++ b/pymc/logprob/basic.py
@@ -39,40 +39,105 @@
from collections import deque
from typing import Dict, List, Optional, Sequence, Union
+import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor import config
-from pytensor.graph.basic import graph_inputs, io_toposort
+from pytensor.graph.basic import Variable, graph_inputs, io_toposort
from pytensor.graph.op import compute_test_value
from pytensor.graph.rewriting.basic import GraphRewriter, NodeRewriter
from pytensor.tensor.random.op import RandomVariable
from pytensor.tensor.var import TensorVariable
-
-from pymc.logprob.abstract import _logprob, get_measurable_outputs
-from pymc.logprob.abstract import logprob as logp_logprob
+from typing_extensions import TypeAlias
+
+from pymc.logprob.abstract import (
+ _icdf_helper,
+ _logcdf_helper,
+ _logprob,
+ _logprob_helper,
+ get_measurable_outputs,
+)
from pymc.logprob.rewriting import construct_ir_fgraph
from pymc.logprob.transforms import RVTransform, TransformValuesRewrite
from pymc.logprob.utils import rvs_to_value_vars
+TensorLike: TypeAlias = Union[Variable, float, np.ndarray]
+
+
+def _warn_rvs_in_inferred_graph(graph: Sequence[TensorVariable]):
+ """Issue warning if any RVs are found in graph.
+
+    RVs are usually (implicit) conditional inputs of the derived probability expression,
+    and are meant to be replaced by their respective value variables before evaluation.
+    However, when the IR graph is built, any non-input nodes (including RVs) are cloned,
+    breaking the link with the original ones.
+    This makes it impossible (or difficult) to replace them by the respective values afterward,
+ so we instruct users to do it beforehand.
+ """
+ from pymc.testing import assert_no_rvs
+
+ try:
+ assert_no_rvs(graph)
+ except AssertionError:
+ warnings.warn(
+ "RandomVariables were found in the derived graph. "
+ "These variables are a clone and do not match the original ones on identity.\n"
+ "If you are deriving a quantity that depends on model RVs, use `model.replace_rvs_by_values` first. For example: "
+ "`logp(model.replace_rvs_by_values([rv])[0], value)`",
+ stacklevel=3,
+ )
+
-def logp(rv: TensorVariable, value) -> TensorVariable:
+def logp(
+ rv: TensorVariable, value: TensorLike, warn_missing_rvs: bool = True, **kwargs
+) -> TensorVariable:
"""Return the log-probability graph of a Random Variable"""
value = pt.as_tensor_variable(value, dtype=rv.dtype)
try:
- return logp_logprob(rv, value)
+ return _logprob_helper(rv, value, **kwargs)
+ except NotImplementedError:
+ fgraph, _, _ = construct_ir_fgraph({rv: value})
+ [(ir_rv, ir_value)] = fgraph.preserve_rv_mappings.rv_values.items()
+ expr = _logprob_helper(ir_rv, ir_value, **kwargs)
+ if warn_missing_rvs:
+ _warn_rvs_in_inferred_graph(expr)
+ return expr
+
+
+def logcdf(
+ rv: TensorVariable, value: TensorLike, warn_missing_rvs: bool = True, **kwargs
+) -> TensorVariable:
+ """Create a graph for the log-CDF of a Random Variable."""
+ value = pt.as_tensor_variable(value, dtype=rv.dtype)
+ try:
+ return _logcdf_helper(rv, value, **kwargs)
+ except NotImplementedError:
+ # Try to rewrite rv
+ fgraph, rv_values, _ = construct_ir_fgraph({rv: value})
+ [ir_rv] = fgraph.outputs
+ expr = _logcdf_helper(ir_rv, value, **kwargs)
+ if warn_missing_rvs:
+ _warn_rvs_in_inferred_graph(expr)
+ return expr
+
+
+def icdf(
+ rv: TensorVariable, value: TensorLike, warn_missing_rvs: bool = True, **kwargs
+) -> TensorVariable:
+ """Create a graph for the inverse CDF of a Random Variable."""
+ value = pt.as_tensor_variable(value, dtype="floatX")
+ try:
+ return _icdf_helper(rv, value, **kwargs)
except NotImplementedError:
- try:
- value = rv.type.filter_variable(value)
- except TypeError as exc:
- raise TypeError(
- "When RV is not a pure distribution, value variable must have the same type"
- ) from exc
- try:
- return factorized_joint_logprob({rv: value}, warn_missing_rvs=False)[value]
- except Exception as exc:
- raise NotImplementedError("PyMC could not infer logp of input variable.") from exc
+ # Try to rewrite rv
+ fgraph, rv_values, _ = construct_ir_fgraph({rv: value})
+ [ir_rv] = fgraph.outputs
+ expr = _icdf_helper(ir_rv, value, **kwargs)
+ if warn_missing_rvs:
+ _warn_rvs_in_inferred_graph(expr)
+ return expr
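A short usage sketch of the three helpers defined above; the Normal distribution and the evaluation points are chosen purely for illustration, and `icdf` still raises `NotImplementedError` for distributions without a registered inverse CDF.

.. code:: python

    import pymc as pm
    from pymc.logprob.basic import icdf, logcdf, logp

    x = pm.Normal.dist(mu=0.0, sigma=1.0)

    print(logp(x, 0.0).eval())    # -0.5 * log(2 * pi) ~= -0.9189
    print(logcdf(x, 0.0).eval())  # log(0.5) ~= -0.6931
    # icdf works where an inverse CDF is registered for the distribution, e.g.:
    # print(icdf(x, 0.975).eval())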
def factorized_joint_logprob(
@@ -92,10 +157,10 @@ def factorized_joint_logprob(
.. code-block:: python
- import pytensor.tensor as at
+ import pytensor.tensor as pt
- sigma2_rv = at.random.invgamma(0.5, 0.5)
- Y_rv = at.random.normal(0, at.sqrt(sigma2_rv))
+ sigma2_rv = pt.random.invgamma(0.5, 0.5)
+ Y_rv = pt.random.normal(0, pt.sqrt(sigma2_rv))
This graph for ``Y_rv`` is equivalent to the following hierarchical model:
@@ -104,11 +169,11 @@ def factorized_joint_logprob(
\sigma^2 \sim& \operatorname{InvGamma}(0.5, 0.5) \\
Y \sim& \operatorname{N}(0, \sigma^2)
- If we create a value variable for ``Y_rv``, i.e. ``y_vv = at.scalar("y")``,
+ If we create a value variable for ``Y_rv``, i.e. ``y_vv = pt.scalar("y")``,
the graph of ``factorized_joint_logprob({Y_rv: y_vv})`` is equivalent to the
conditional probability :math:`\log p(Y = y \mid \sigma^2)`, with a stochastic
``sigma2_rv``. If we specify a value variable for ``sigma2_rv``, i.e.
- ``s_vv = at.scalar("s2")``, then ``factorized_joint_logprob({Y_rv: y_vv, sigma2_rv: s_vv})``
+ ``s_vv = pt.scalar("s2")``, then ``factorized_joint_logprob({Y_rv: y_vv, sigma2_rv: s_vv})``
yields the joint log-probability of the two variables.
.. math::
@@ -189,7 +254,8 @@ def factorized_joint_logprob(
if warn_missing_rvs:
warnings.warn(
"Found a random variable that was neither among the observations "
- f"nor the conditioned variables: {node.outputs}"
+ f"nor the conditioned variables: {outputs}.\n"
+ "This variables is a clone and does not match the original one on identity."
)
continue
diff --git a/pymc/logprob/censoring.py b/pymc/logprob/censoring.py
index 612fc59a65..57a2660d51 100644
--- a/pymc/logprob/censoring.py
+++ b/pymc/logprob/censoring.py
@@ -37,7 +37,7 @@
from typing import List, Optional
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Node
from pytensor.graph.fg import FunctionGraph
@@ -52,10 +52,9 @@
MeasurableVariable,
_logcdf,
_logprob,
- assign_custom_measurable_outputs,
)
from pymc.logprob.rewriting import measurable_ir_rewrites_db
-from pymc.logprob.utils import CheckParameterValue
+from pymc.logprob.utils import CheckParameterValue, ignore_logprob
class MeasurableClip(MeasurableElemwise):
@@ -91,11 +90,11 @@ def find_measurable_clips(fgraph: FunctionGraph, node: Node) -> Optional[List[Me
# Replace bounds by `+-inf` if `y = clip(x, x, ?)` or `y=clip(x, ?, x)`
# This is used in `clip_logprob` to generate a more succinct logprob graph
# for one-sided clipped random variables
- lower_bound = lower_bound if (lower_bound is not base_var) else at.constant(-np.inf)
- upper_bound = upper_bound if (upper_bound is not base_var) else at.constant(np.inf)
+ lower_bound = lower_bound if (lower_bound is not base_var) else pt.constant(-np.inf)
+ upper_bound = upper_bound if (upper_bound is not base_var) else pt.constant(np.inf)
# Make base_var unmeasurable
- unmeasurable_base_var = assign_custom_measurable_outputs(base_var.owner)
+ unmeasurable_base_var = ignore_logprob(base_var)
clipped_rv_node = measurable_clip.make_node(unmeasurable_base_var, lower_bound, upper_bound)
clipped_rv = clipped_rv_node.outputs[0]
@@ -143,28 +142,28 @@ def clip_logprob(op, values, base_rv, lower_bound, upper_bound, **kwargs):
if not (isinstance(upper_bound, TensorConstant) and np.all(np.isinf(upper_bound.value))):
is_upper_bounded = True
- logccdf = at.log1mexp(logcdf)
+ logccdf = pt.log1mexp(logcdf)
# For right clipped discrete RVs, we need to add an extra term
# corresponding to the pmf at the upper bound
if base_rv.dtype.startswith("int"):
- logccdf = at.logaddexp(logccdf, logprob)
+ logccdf = pt.logaddexp(logccdf, logprob)
- logprob = at.switch(
- at.eq(value, upper_bound),
+ logprob = pt.switch(
+ pt.eq(value, upper_bound),
logccdf,
- at.switch(at.gt(value, upper_bound), -np.inf, logprob),
+ pt.switch(pt.gt(value, upper_bound), -np.inf, logprob),
)
if not (isinstance(lower_bound, TensorConstant) and np.all(np.isneginf(lower_bound.value))):
is_lower_bounded = True
- logprob = at.switch(
- at.eq(value, lower_bound),
+ logprob = pt.switch(
+ pt.eq(value, lower_bound),
logcdf,
- at.switch(at.lt(value, lower_bound), -np.inf, logprob),
+ pt.switch(pt.lt(value, lower_bound), -np.inf, logprob),
)
if is_lower_bounded and is_upper_bounded:
logprob = CheckParameterValue("lower_bound <= upper_bound")(
- logprob, at.all(at.le(lower_bound, upper_bound))
+ logprob, pt.all(pt.le(lower_bound, upper_bound))
)
return logprob
@@ -198,7 +197,7 @@ def find_measurable_roundings(fgraph: FunctionGraph, node: Node) -> Optional[Lis
return None
# Make base_var unmeasurable
- unmeasurable_base_var = assign_custom_measurable_outputs(base_var.owner)
+ unmeasurable_base_var = ignore_logprob(base_var)
rounded_op = MeasurableRound(node.op.scalar_op)
rounded_rv = rounded_op.make_node(unmeasurable_base_var).default_output()
@@ -243,15 +242,15 @@ def round_logprob(op, values, base_rv, **kwargs):
(value,) = values
if isinstance(op.scalar_op, RoundHalfToEven):
- value = at.round(value)
+ value = pt.round(value)
value_upper = value + 0.5
value_lower = value - 0.5
elif isinstance(op.scalar_op, Floor):
- value = at.floor(value)
+ value = pt.floor(value)
value_upper = value + 1.0
value_lower = value
elif isinstance(op.scalar_op, Ceil):
- value = at.ceil(value)
+ value = pt.ceil(value)
value_upper = value
value_lower = value - 1.0
else:
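A minimal sketch of what the clip rewrite above enables, assuming the usual IR fallback in `pm.logp`; the variable names are illustrative only. The logp of a clipped variable is the base density in the interior and the accumulated CDF/survival mass at the bounds.

.. code-block:: python

    import numpy as np
    import pymc as pm
    import pytensor.tensor as pt

    # Clipped (censored) standard normal: mass above 1 collapses onto the bound
    x = pt.clip(pm.Normal.dist(0, 1), -1, 1)
    x_vv = pt.scalar("x_vv")
    x_logp = pm.logp(x, x_vv)

    # At the upper bound the logp equals the log survival probability P(N(0, 1) >= 1)
    np.exp(x_logp.eval({x_vv: 1.0}))  # roughly 0.16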
diff --git a/pymc/logprob/checks.py b/pymc/logprob/checks.py
new file mode 100644
index 0000000000..914c0c4b2d
--- /dev/null
+++ b/pymc/logprob/checks.py
@@ -0,0 +1,158 @@
+# Copyright 2023 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2021-2022 aesara-devs
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import List, Optional
+
+import pytensor.tensor as pt
+
+from pytensor.graph.rewriting.basic import node_rewriter
+from pytensor.raise_op import CheckAndRaise, ExceptionType
+from pytensor.tensor.shape import SpecifyShape
+
+from pymc.logprob.abstract import MeasurableVariable, _logprob, _logprob_helper
+from pymc.logprob.rewriting import PreserveRVMappings, measurable_ir_rewrites_db
+from pymc.logprob.utils import ignore_logprob
+
+
+class MeasurableSpecifyShape(SpecifyShape):
+ """A placeholder used to specify a log-likelihood for a specify-shape sub-graph."""
+
+
+MeasurableVariable.register(MeasurableSpecifyShape)
+
+
+@_logprob.register(MeasurableSpecifyShape)
+def logprob_specify_shape(op, values, inner_rv, *shapes, **kwargs):
+ (value,) = values
+ # transfer specify_shape from rv to value
+ value = pt.specify_shape(value, shapes)
+ return _logprob_helper(inner_rv, value)
+
+
+@node_rewriter([SpecifyShape])
+def find_measurable_specify_shapes(fgraph, node) -> Optional[List[MeasurableSpecifyShape]]:
+ r"""Finds `SpecifyShapeOp`\s for which a `logprob` can be computed."""
+
+ if isinstance(node.op, MeasurableSpecifyShape):
+ return None # pragma: no cover
+
+ rv_map_feature: Optional[PreserveRVMappings] = getattr(fgraph, "preserve_rv_mappings", None)
+
+ if rv_map_feature is None:
+ return None # pragma: no cover
+
+ rv = node.outputs[0]
+
+ base_rv, *shape = node.inputs
+
+ if not (
+ base_rv.owner
+ and isinstance(base_rv.owner.op, MeasurableVariable)
+ and base_rv not in rv_map_feature.rv_values
+ ):
+ return None # pragma: no cover
+
+ new_op = MeasurableSpecifyShape()
+ # Make base_var unmeasurable
+ unmeasurable_base_rv = ignore_logprob(base_rv)
+ new_rv = new_op.make_node(unmeasurable_base_rv, *shape).default_output()
+ new_rv.name = rv.name
+
+ return [new_rv]
+
+
+measurable_ir_rewrites_db.register(
+ "find_measurable_specify_shapes",
+ find_measurable_specify_shapes,
+ "basic",
+ "specify_shape",
+)
+
+
+class MeasurableCheckAndRaise(CheckAndRaise):
+ """A placeholder used to specify a log-likelihood for an assert sub-graph."""
+
+
+MeasurableVariable.register(MeasurableCheckAndRaise)
+
+
+@_logprob.register(MeasurableCheckAndRaise)
+def logprob_assert(op, values, inner_rv, *assertion, **kwargs):
+ (value,) = values
+ # transfer assertion from rv to value
+    value = op(value, *assertion)
+ return _logprob_helper(inner_rv, value)
+
+
+@node_rewriter([CheckAndRaise])
+def find_measurable_asserts(fgraph, node) -> Optional[List[MeasurableCheckAndRaise]]:
+ r"""Finds `AssertOp`\s for which a `logprob` can be computed."""
+
+ if isinstance(node.op, MeasurableCheckAndRaise):
+ return None # pragma: no cover
+
+ rv_map_feature: Optional[PreserveRVMappings] = getattr(fgraph, "preserve_rv_mappings", None)
+
+ if rv_map_feature is None:
+ return None # pragma: no cover
+
+ rv = node.outputs[0]
+
+ base_rv, *conds = node.inputs
+
+ if not (
+ base_rv.owner
+ and isinstance(base_rv.owner.op, MeasurableVariable)
+ and base_rv not in rv_map_feature.rv_values
+ ):
+ return None # pragma: no cover
+
+ exception_type = ExceptionType()
+ new_op = MeasurableCheckAndRaise(exc_type=exception_type)
+ # Make base_var unmeasurable
+ unmeasurable_base_rv = ignore_logprob(base_rv)
+ new_rv = new_op.make_node(unmeasurable_base_rv, *conds).default_output()
+ new_rv.name = rv.name
+
+ return [new_rv]
+
+
+measurable_ir_rewrites_db.register(
+ "find_measurable_asserts",
+ find_measurable_asserts,
+ "basic",
+ "assert",
+)
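A minimal sketch of what the new `MeasurableSpecifyShape` rewrite enables, assuming `pm.logp` performs the IR fallback used elsewhere in this change set; names are illustrative. The shape assertion is transferred to the value, and the base variable's logp is returned unchanged.

.. code-block:: python

    import numpy as np
    import pymc as pm
    import pytensor.tensor as pt

    # A shape-constrained RV keeps the logp of the underlying RV
    x = pt.specify_shape(pm.Normal.dist(0, 1, size=3), (3,))
    x_vv = pt.vector("x_vv")
    x_logp = pm.logp(x, x_vv)

    x_logp.eval({x_vv: np.zeros(3)})  # three standard-normal logpdfs, about -0.92 each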
diff --git a/pymc/logprob/cumsum.py b/pymc/logprob/cumsum.py
index 5d46e96e80..1ae8a0b60c 100644
--- a/pymc/logprob/cumsum.py
+++ b/pymc/logprob/cumsum.py
@@ -36,18 +36,14 @@
from typing import List, Optional
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.rewriting.basic import node_rewriter
from pytensor.tensor.extra_ops import CumOp
-from pymc.logprob.abstract import (
- MeasurableVariable,
- _logprob,
- assign_custom_measurable_outputs,
- logprob,
-)
+from pymc.logprob.abstract import MeasurableVariable, _logprob, _logprob_helper
from pymc.logprob.rewriting import PreserveRVMappings, measurable_ir_rewrites_db
+from pymc.logprob.utils import ignore_logprob
class MeasurableCumsum(CumOp):
@@ -62,13 +58,13 @@ def logprob_cumsum(op, values, base_rv, **kwargs):
"""Compute the log-likelihood graph for a `Cumsum`."""
(value,) = values
- value_diff = at.diff(value, axis=op.axis)
- value_diff = at.concatenate(
+ value_diff = pt.diff(value, axis=op.axis)
+ value_diff = pt.concatenate(
(
# Take first element of axis and add a broadcastable dimension so
# that it can be concatenated with the rest of value_diff
- at.shape_padaxis(
- at.take(value, 0, axis=op.axis),
+ pt.shape_padaxis(
+ pt.take(value, 0, axis=op.axis),
axis=op.axis,
),
value_diff,
@@ -76,7 +72,7 @@ def logprob_cumsum(op, values, base_rv, **kwargs):
axis=op.axis,
)
- cumsum_logp = logprob(base_rv, value_diff)
+ cumsum_logp = _logprob_helper(base_rv, value_diff)
return cumsum_logp
@@ -112,7 +108,7 @@ def find_measurable_cumsums(fgraph, node) -> Optional[List[MeasurableCumsum]]:
new_op = MeasurableCumsum(axis=node.op.axis or 0, mode="add")
# Make base_var unmeasurable
- unmeasurable_base_rv = assign_custom_measurable_outputs(base_rv.owner)
+ unmeasurable_base_rv = ignore_logprob(base_rv)
new_rv = new_op.make_node(unmeasurable_base_rv).default_output()
new_rv.name = rv.name
diff --git a/pymc/logprob/mixture.py b/pymc/logprob/mixture.py
index 5ee35968ef..af331ce637 100644
--- a/pymc/logprob/mixture.py
+++ b/pymc/logprob/mixture.py
@@ -37,7 +37,7 @@
from typing import List, Optional, Tuple, Union, cast
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Apply, Constant, Variable
from pytensor.graph.fg import FunctionGraph
@@ -47,7 +47,7 @@
node_rewriter,
pre_greedy_node_rewriter,
)
-from pytensor.ifelse import ifelse
+from pytensor.ifelse import IfElse, ifelse
from pytensor.scalar.basic import Switch
from pytensor.tensor.basic import Join, MakeVector
from pytensor.tensor.elemwise import Elemwise
@@ -69,18 +69,15 @@
from pytensor.tensor.type_other import NoneConst, NoneTypeT, SliceConstant, SliceType
from pytensor.tensor.var import TensorVariable
-from pymc.logprob.abstract import (
- MeasurableVariable,
- _logprob,
- assign_custom_measurable_outputs,
- logprob,
-)
+from pymc.logprob.abstract import MeasurableVariable, _logprob, _logprob_helper
from pymc.logprob.rewriting import (
local_lift_DiracDelta,
logprob_rewrites_db,
+ measurable_ir_rewrites_db,
subtensor_ops,
)
from pymc.logprob.tensor import naive_bcast_rv_lift
+from pymc.logprob.utils import ignore_logprob, ignore_logprob_multiple_vars
def is_newaxis(x):
@@ -141,7 +138,7 @@ def expand_indices(
if not is_basic_idx(idx):
s = shape_copy.pop(0)
- idx = at.as_tensor(idx)
+ idx = pt.as_tensor(idx)
if moved_subspace:
# The subspace generated by advanced indices appear as the
@@ -169,7 +166,7 @@ def expand_indices(
if isinstance(idx, slice) or isinstance(getattr(idx, "type", None), SliceType):
idx = as_index_literal(idx)
idx_slice, _ = get_canonical_form_slice(idx, s)
- idx = at.arange(idx_slice.start, idx_slice.stop, idx_slice.step)
+ idx = pt.arange(idx_slice.start, idx_slice.stop, idx_slice.step)
if moved_subspace:
# Basic indices appear in the lower dimensions
@@ -202,7 +199,7 @@ def expand_indices(
adv_indices.append(expanded_idx)
- return cast(Tuple[TensorVariable], tuple(at.broadcast_arrays(*adv_indices)))
+ return cast(Tuple[TensorVariable], tuple(pt.broadcast_arrays(*adv_indices)))
def rv_pull_down(x: TensorVariable, dont_touch_vars=None) -> TensorVariable:
@@ -267,7 +264,7 @@ def get_stack_mixture_vars(
join_axis = joined_rvs.owner.inputs[0]
# TODO: Support symbolic join axes. This will raise ValueError if it's not a constant
(join_axis,) = constant_fold((join_axis,), raise_not_constant=False)
- join_axis = at.as_tensor(join_axis, dtype="int64")
+ join_axis = pt.as_tensor(join_axis, dtype="int64")
mixture_rvs = joined_rvs.owner.inputs[1:]
@@ -328,9 +325,7 @@ def mixture_replace(fgraph, node):
# We create custom types for the mixture components and assign them
# null `get_measurable_outputs` dispatches so that they aren't
# erroneously encountered in places like `factorized_joint_logprob`.
- new_node = assign_custom_measurable_outputs(component_rv.owner)
- out_idx = component_rv.owner.outputs.index(component_rv)
- new_comp_rv = new_node.outputs[out_idx]
+ new_comp_rv = ignore_logprob(component_rv)
new_mixture_rvs.append(new_comp_rv)
# Replace this sub-graph with a `MixtureRV`
@@ -379,9 +374,7 @@ def switch_mixture_replace(fgraph, node):
and component_rv not in rv_map_feature.rv_values
):
return None
- new_node = assign_custom_measurable_outputs(component_rv.owner)
- out_idx = component_rv.owner.outputs.index(component_rv)
- new_comp_rv = new_node.outputs[out_idx]
+ new_comp_rv = ignore_logprob(component_rv)
mixture_rvs.append(new_comp_rv)
mix_op = MixtureRV(
@@ -437,10 +430,10 @@ def logprob_MixtureRV(
bcast_indices = expand_indices(indices, original_shape)
- logp_val = at.empty(bcast_indices[0].shape)
+ logp_val = pt.empty(bcast_indices[0].shape)
for m, rv in enumerate(comp_rvs):
- idx_m_on_axis = at.nonzero(at.eq(bcast_indices[join_axis_val], m))
+ idx_m_on_axis = pt.nonzero(pt.eq(bcast_indices[join_axis_val], m))
m_indices = tuple(
v[idx_m_on_axis] for i, v in enumerate(bcast_indices) if i != join_axis_val
)
@@ -452,8 +445,8 @@ def logprob_MixtureRV(
# this intentional one-off?
rv_m = rv_pull_down(rv[m_indices] if m_indices else rv)
val_m = value[idx_m_on_axis]
- logp_m = logprob(rv_m, val_m)
- logp_val = at.set_subtensor(logp_val[idx_m_on_axis], logp_m)
+ logp_m = _logprob_helper(rv_m, val_m)
+ logp_val = pt.set_subtensor(logp_val[idx_m_on_axis], logp_m)
else:
# FIXME: This logprob implementation does not support mixing across distinct components,
@@ -466,17 +459,17 @@ def logprob_MixtureRV(
join_axis_val = None if isinstance(join_axis.type, NoneTypeT) else join_axis.data
if join_axis_val is not None:
- value = at.expand_dims(value, axis=join_axis_val)
+ value = pt.expand_dims(value, axis=join_axis_val)
logp_val = 0.0
for i, comp_rv in enumerate(comp_rvs):
- comp_logp = logprob(comp_rv, value)
+ comp_logp = _logprob_helper(comp_rv, value)
if join_axis_val is not None:
- comp_logp = at.squeeze(comp_logp, axis=join_axis_val)
+ comp_logp = pt.squeeze(comp_logp, axis=join_axis_val)
logp_val += ifelse(
- at.eq(indices[0], i),
+ pt.eq(indices[0], i),
comp_logp,
- at.zeros_like(comp_logp),
+ pt.zeros_like(comp_logp),
)
return logp_val
@@ -491,3 +484,71 @@ def logprob_MixtureRV(
"basic",
"mixture",
)
+
+
+class MeasurableIfElse(IfElse):
+ """Measurable subclass of IfElse operator."""
+
+
+MeasurableVariable.register(MeasurableIfElse)
+
+
+@node_rewriter([IfElse])
+def find_measurable_ifelse_mixture(fgraph, node):
+ rv_map_feature = getattr(fgraph, "preserve_rv_mappings", None)
+
+ if rv_map_feature is None:
+ return None # pragma: no cover
+
+ if isinstance(node.op, MeasurableIfElse):
+ return None
+
+    # Check if all components are unvalued measurable variables
+ if_var, *base_rvs = node.inputs
+
+ if not all(
+ (
+ rv.owner is not None
+ and isinstance(rv.owner.op, MeasurableVariable)
+ and rv not in rv_map_feature.rv_values
+ )
+ for rv in base_rvs
+ ):
+ return None # pragma: no cover
+
+ unmeasurable_base_rvs = ignore_logprob_multiple_vars(base_rvs, rv_map_feature.rv_values)
+
+ return MeasurableIfElse(n_outs=node.op.n_outs).make_node(if_var, *unmeasurable_base_rvs).outputs
+
+
+measurable_ir_rewrites_db.register(
+ "find_measurable_ifelse_mixture",
+ find_measurable_ifelse_mixture,
+ "basic",
+ "mixture",
+)
+
+
+@_logprob.register(MeasurableIfElse)
+def logprob_ifelse(op, values, if_var, *base_rvs, **kwargs):
+ """Compute the log-likelihood graph for an `IfElse`."""
+ from pymc.pytensorf import replace_rvs_by_values
+
+ assert len(values) * 2 == len(base_rvs)
+
+ rvs_to_values_then = {then_rv: value for then_rv, value in zip(base_rvs[: len(values)], values)}
+ rvs_to_values_else = {else_rv: value for else_rv, value in zip(base_rvs[len(values) :], values)}
+
+ logps_then = [
+ _logprob_helper(rv_then, value, **kwargs) for rv_then, value in rvs_to_values_then.items()
+ ]
+ logps_else = [
+ _logprob_helper(rv_else, value, **kwargs) for rv_else, value in rvs_to_values_else.items()
+ ]
+
+ # If the multiple variables depend on each other, we have to replace them
+ # by the respective values
+ logps_then = replace_rvs_by_values(logps_then, rvs_to_values=rvs_to_values_then)
+ logps_else = replace_rvs_by_values(logps_else, rvs_to_values=rvs_to_values_else)
+
+ return ifelse(if_var, logps_then, logps_else)
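A rough usage sketch of the `MeasurableIfElse` support added above (illustrative only, using `factorized_joint_logprob` as imported elsewhere in this diff): the logp of an `ifelse` between two measurable components is the `ifelse` of their logps.

.. code-block:: python

    import pymc as pm
    import pytensor.tensor as pt

    from pytensor.ifelse import ifelse

    from pymc.logprob.basic import factorized_joint_logprob

    use_first = pt.scalar("use_first", dtype="bool")
    x = ifelse(use_first, pm.Normal.dist(0, 1), pm.Normal.dist(10, 1))
    x_vv = pt.scalar("x_vv")

    [x_logp] = factorized_joint_logprob({x: x_vv}).values()
    x_logp.eval({use_first: True, x_vv: 0.0})   # about -0.92, Normal(0, 1) at 0
    x_logp.eval({use_first: False, x_vv: 0.0})  # about -50.9, Normal(10, 1) at 0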
diff --git a/pymc/logprob/rewriting.py b/pymc/logprob/rewriting.py
index dc13fdfb7e..70279efda5 100644
--- a/pymc/logprob/rewriting.py
+++ b/pymc/logprob/rewriting.py
@@ -36,7 +36,7 @@
from typing import Dict, Optional, Tuple
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.compile.mode import optdb
from pytensor.graph.basic import Variable
@@ -208,7 +208,7 @@ def incsubtensor_rv_replace(fgraph, node):
To compute the log-probability of a statement like ``Y[idx] = data``, we must
first realize that our objective is equivalent to computing ``logprob(Y, z)``,
- where ``z = at.set_subtensor(y[idx], data)`` and ``y`` is the value variable
+ where ``z = pt.set_subtensor(y[idx], data)`` and ``y`` is the value variable
for ``Y``.
In other words, the log-probability for an `*IncSubtensor*` is the log-probability
@@ -243,7 +243,7 @@ def incsubtensor_rv_replace(fgraph, node):
# Create a new value variable with the indices `idx` set to `data`
value_var = rv_map_feature.rv_values[rv_var]
- new_value_var = at.set_subtensor(value_var[idx], data)
+ new_value_var = pt.set_subtensor(value_var[idx], data)
rv_map_feature.update_rv_maps(rv_var, new_value_var, base_rv_var)
# Return the `RandomVariable` being indexed
diff --git a/pymc/logprob/scan.py b/pymc/logprob/scan.py
index d9eb85a838..aaa2d69f73 100644
--- a/pymc/logprob/scan.py
+++ b/pymc/logprob/scan.py
@@ -39,10 +39,9 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.basic import Variable
-from pytensor.graph.fg import FunctionGraph
from pytensor.graph.op import compute_test_value
from pytensor.graph.rewriting.basic import node_rewriter
from pytensor.graph.rewriting.db import RewriteDatabaseQuery
@@ -54,18 +53,28 @@
from pytensor.tensor.var import TensorVariable
from pytensor.updates import OrderedUpdates
-from pymc.logprob.abstract import MeasurableVariable, _get_measurable_outputs, _logprob
-from pymc.logprob.joint_logprob import factorized_joint_logprob
+from pymc.logprob.abstract import (
+ MeasurableVariable,
+ _get_measurable_outputs,
+ _logprob,
+ get_measurable_outputs,
+)
+from pymc.logprob.basic import factorized_joint_logprob
from pymc.logprob.rewriting import (
+ construct_ir_fgraph,
inc_subtensor_ops,
logprob_rewrites_db,
measurable_ir_rewrites_db,
)
+from pymc.pytensorf import replace_rvs_by_values
class MeasurableScan(Scan):
"""A placeholder used to specify a log-likelihood for a scan sub-graph."""
+ def __str__(self):
+ return f"Measurable({super().__str__()})"
+
MeasurableVariable.register(MeasurableScan)
@@ -213,7 +222,7 @@ def remove(x, i):
# slices of the actual outer-inputs (e.g. `out[1:]` instead of `out`
# when `taps=[-1]`).
var_slices = [new_outer_input_vars[oo_var][b:e] for b, e in slice_seqs]
- n_steps = at.min([at.shape(n)[0] for n in var_slices])
+ n_steps = pt.min([pt.shape(n)[0] for n in var_slices])
output_scan_args.n_steps = n_steps
@@ -239,9 +248,27 @@ def remove(x, i):
new_inner_out_nit_sot = tuple(output_scan_args.inner_out_nit_sot) + tuple(
inner_out_fn(remapped_io_to_ii)
)
-
output_scan_args.inner_out_nit_sot = list(new_inner_out_nit_sot)
+ # Finally, we need to replace any lingering references to the new
+ # internal variables that could be in the recurrent states needed
+ # to compute the new nit_sots
+ traced_outs = (
+ output_scan_args.inner_out_mit_sot
+ + output_scan_args.inner_out_sit_sot
+ + output_scan_args.inner_out_nit_sot
+ )
+ traced_outs = replace_rvs_by_values(traced_outs, rvs_to_values=remapped_io_to_ii)
+ # Update output mappings
+ n_mit_sot = len(output_scan_args.inner_out_mit_sot)
+ output_scan_args.inner_out_mit_sot = traced_outs[:n_mit_sot]
+ offset = n_mit_sot
+ n_sit_sot = len(output_scan_args.inner_out_sit_sot)
+ output_scan_args.inner_out_sit_sot = traced_outs[offset : offset + n_sit_sot]
+ offset += n_sit_sot
+ n_nit_sot = len(output_scan_args.inner_out_nit_sot)
+ output_scan_args.inner_out_nit_sot = traced_outs[offset : offset + n_nit_sot]
+
return output_scan_args
@@ -321,7 +348,12 @@ def create_inner_out_logp(value_map: Dict[TensorVariable, TensorVariable]) -> Te
for key, value in updates.items():
key.default_update = value
- return logp_scan_out
+ # Return only the logp outputs, not any potentially carried states
+ logp_outputs = logp_scan_out[-len(values) :]
+
+ if len(logp_outputs) == 1:
+ return logp_outputs[0]
+ return logp_outputs
@node_rewriter([Scan])
@@ -359,6 +391,12 @@ def find_measurable_scans(fgraph, node):
)
for n in local_fgraph_topo:
if isinstance(n.op, MeasurableVariable):
+ measurable_outputs = get_measurable_outputs(n.op, n)
+                # This variable's source of measure is used by another inner node,
+                # so we don't need it to be an output.
+ if not measurable_outputs:
+ continue
+
non_output_node_clients = [
c for c in clients[n] if c not in curr_scanargs.inner_outputs
]
@@ -429,7 +467,7 @@ def find_measurable_scans(fgraph, node):
full_out_shape = tuple(
fgraph.shape_feature.get_shape(full_out, i) for i in range(full_out.ndim)
)
- new_val_var = at.empty(full_out_shape, dtype=full_out.dtype)
+ new_val_var = pt.empty(full_out_shape, dtype=full_out.dtype)
# Set the parts of this new value variable that applied to the
# user-specified value variable to the user's value variable
@@ -439,7 +477,7 @@ def find_measurable_scans(fgraph, node):
# E.g. for a single `-1` TAPS, `s_0T[1:] = s_1T` where `s_0T` is
# `new_val_var` and `s_1T` is the user-specified value variable
# that only spans times `t=1` to `t=T`.
- new_val_var = at.set_subtensor(new_val_var[subtensor_indices], val_var)
+ new_val_var = pt.set_subtensor(new_val_var[subtensor_indices], val_var)
# This is the outer-input that sets `s_0T[i] = taps[i]` where `i`
# is a TAP index (e.g. a TAP of `-1` maps to index `0` in a vector
@@ -488,15 +526,9 @@ def add_opts_to_inner_graphs(fgraph, node):
if getattr(node.op.mode, "had_logprob_rewrites", False):
return None
- inner_fgraph = FunctionGraph(
- node.op.inner_inputs,
- node.op.inner_outputs,
- clone=True,
- copy_inputs=False,
- copy_orphans=False,
- )
-
- logprob_rewrites_db.query(RewriteDatabaseQuery(include=["basic"])).rewrite(inner_fgraph)
+ inner_rv_values = {out: out.type() for out in node.op.inner_outputs}
+ ir_rewriter = logprob_rewrites_db.query(RewriteDatabaseQuery(include=["basic"]))
+ inner_fgraph, rv_values, _ = construct_ir_fgraph(inner_rv_values, ir_rewriter=ir_rewriter)
new_outputs = list(inner_fgraph.outputs)
@@ -511,11 +543,23 @@ def add_opts_to_inner_graphs(fgraph, node):
@_get_measurable_outputs.register(MeasurableScan)
-def _get_measurable_outputs_MeasurableScan(op, node):
- # TODO: This should probably use `get_random_outer_outputs`
- # scan_args = ScanArgs.from_node(node)
- # rv_outer_outs = get_random_outer_outputs(scan_args)
- return [o for o in node.outputs if not isinstance(o.type, RandomType)]
+def _get_measurable_outputs_MeasurableScan(op: Scan, node):
+ """Collect measurable outputs for Measurable Scans"""
+ inner_out_from_outer_out_map = op.get_oinp_iinp_iout_oout_mappings()["inner_out_from_outer_out"]
+ inner_outs = op.inner_outputs
+
+ # Measurable scan outputs are those whose inner scan output counterparts are also measurable
+ measurable_outputs = []
+ for out_idx, out in enumerate(node.outputs):
+ [inner_out_idx] = inner_out_from_outer_out_map[out_idx]
+ inner_out = inner_outs[inner_out_idx]
+ inner_out_node = inner_out.owner
+ if isinstance(
+ inner_out_node.op, MeasurableVariable
+ ) and inner_out in get_measurable_outputs(inner_out_node.op, inner_out_node):
+ measurable_outputs.append(out)
+
+ return measurable_outputs
measurable_ir_rewrites_db.register(
diff --git a/pymc/logprob/tensor.py b/pymc/logprob/tensor.py
index 33bab00466..6ca11b65f4 100644
--- a/pymc/logprob/tensor.py
+++ b/pymc/logprob/tensor.py
@@ -38,7 +38,7 @@
import pytensor
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.graph.op import compute_test_value
from pytensor.graph.rewriting.basic import node_rewriter
from pytensor.tensor.basic import Join, MakeVector
@@ -50,13 +50,9 @@
local_rv_size_lift,
)
-from pymc.logprob.abstract import (
- MeasurableVariable,
- _logprob,
- assign_custom_measurable_outputs,
- logprob,
-)
+from pymc.logprob.abstract import MeasurableVariable, _logprob, _logprob_helper
from pymc.logprob.rewriting import PreserveRVMappings, measurable_ir_rewrites_db
+from pymc.logprob.utils import ignore_logprob, ignore_logprob_multiple_vars
@node_rewriter([BroadcastTo])
@@ -107,9 +103,9 @@ def naive_bcast_rv_lift(fgraph, node):
rng, size, dtype, *dist_params = lifted_node.inputs
new_dist_params = [
- at.broadcast_to(
+ pt.broadcast_to(
param,
- at.broadcast_shape(tuple(param.shape), tuple(bcast_shape), arrays_are_shapes=True),
+ pt.broadcast_shape(tuple(param.shape), tuple(bcast_shape), arrays_are_shapes=True),
)
for param in dist_params
]
@@ -141,12 +137,12 @@ def logprob_make_vector(op, values, *base_rvs, **kwargs):
base_rv.name = f"base_rv[{i}]"
value.name = f"value[{i}]"
- logps = [logprob(base_rv, value) for base_rv, value in base_rvs_to_values.items()]
+ logps = [_logprob_helper(base_rv, value) for base_rv, value in base_rvs_to_values.items()]
# If the stacked variables depend on each other, we have to replace them by the respective values
logps = replace_rvs_by_values(logps, rvs_to_values=base_rvs_to_values)
- return at.stack(logps)
+ return pt.stack(logps)
class MeasurableJoin(Join):
@@ -169,7 +165,7 @@ def logprob_join(op, values, axis, *base_rvs, **kwargs):
# We don't need the graph to be constant, just to have RandomVariables removed
base_rv_shapes = constant_fold(base_rv_shapes, raise_not_constant=False)
- split_values = at.split(
+ split_values = pt.split(
value,
splits_size=base_rv_shapes,
n_splits=len(base_rvs),
@@ -178,7 +174,8 @@ def logprob_join(op, values, axis, *base_rvs, **kwargs):
base_rvs_to_split_values = {base_rv: value for base_rv, value in zip(base_rvs, split_values)}
logps = [
- logprob(base_var, split_value) for base_var, split_value in base_rvs_to_split_values.items()
+ _logprob_helper(base_var, split_value)
+ for base_var, split_value in base_rvs_to_split_values.items()
]
if len({logp.ndim for logp in logps}) != 1:
@@ -191,8 +188,8 @@ def logprob_join(op, values, axis, *base_rvs, **kwargs):
logps = replace_rvs_by_values(logps, rvs_to_values=base_rvs_to_split_values)
base_vars_ndim_supp = split_values[0].ndim - logps[0].ndim
- join_logprob = at.concatenate(
- [at.atleast_1d(logp) for logp in logps],
+ join_logprob = pt.concatenate(
+ [pt.atleast_1d(logp) for logp in logps],
axis=axis - base_vars_ndim_supp,
)
@@ -232,30 +229,7 @@ def find_measurable_stacks(
):
return None # pragma: no cover
- # Make base_vars unmeasurable
- base_to_unmeasurable_vars = {
- base_var: assign_custom_measurable_outputs(base_var.owner).outputs[
- base_var.owner.outputs.index(base_var)
- ]
- for base_var in base_vars
- }
-
- def replacement_fn(var, replacements):
- if var in base_to_unmeasurable_vars:
- replacements[var] = base_to_unmeasurable_vars[var]
- # We don't want to clone valued nodes. Assigning a var to itself in the
- # replacements prevents this
- elif var in rvs_to_values:
- replacements[var] = var
-
- return []
-
- # TODO: Fix this import circularity!
- from pymc.pytensorf import _replace_rvs_in_graphs
-
- unmeasurable_base_vars, _ = _replace_rvs_in_graphs(
- graphs=base_vars, replacement_fn=replacement_fn
- )
+ unmeasurable_base_vars = ignore_logprob_multiple_vars(base_vars, rvs_to_values)
if is_join:
measurable_stack = MeasurableJoin()(axis, *unmeasurable_base_vars)
@@ -298,7 +272,7 @@ def logprob_dimshuffle(op, values, base_var, **kwargs):
undo_ds = [original_shuffle.index(i) for i in range(len(original_shuffle))]
value = value.dimshuffle(undo_ds)
- raw_logp = logprob(base_var, value)
+ raw_logp = _logprob_helper(base_var, value)
# Re-apply original dimshuffle, ignoring any support dimensions consumed by
# the logprob function. This assumes that support dimensions are always in
@@ -339,7 +313,7 @@ def find_measurable_dimshuffles(fgraph, node) -> Optional[List[MeasurableDimShuf
return None # pragma: no cover
# Make base_vars unmeasurable
- base_var = assign_custom_measurable_outputs(base_var.owner)
+ base_var = ignore_logprob(base_var)
measurable_dimshuffle = MeasurableDimShuffle(node.op.input_broadcastable, node.op.new_order)(
base_var
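A minimal sketch of the stacking support this file provides (illustrative names, assuming the IR fallback in `pm.logp`): the logp of a stack of measurable variables is the stack of the component logps, each evaluated at the corresponding slice of the value.

.. code-block:: python

    import numpy as np
    import pymc as pm
    import pytensor.tensor as pt

    x = pt.stack([pm.Normal.dist(0, 1), pm.Normal.dist(1, 2)])
    x_vv = pt.vector("x_vv")
    x_logp = pm.logp(x, x_vv)

    x_logp.eval({x_vv: np.zeros(2)})  # logps of Normal(0, 1) and Normal(1, 2), both at 0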
diff --git a/pymc/logprob/transforms.py b/pymc/logprob/transforms.py
index 5f3f320be5..008be0b731 100644
--- a/pymc/logprob/transforms.py
+++ b/pymc/logprob/transforms.py
@@ -40,7 +40,7 @@
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.gradient import DisconnectedType, jacobian
from pytensor.graph.basic import Apply, Node, Variable
@@ -77,12 +77,15 @@
MeasurableElemwise,
MeasurableVariable,
_get_measurable_outputs,
+ _icdf,
+ _icdf_helper,
+ _logcdf,
+ _logcdf_helper,
_logprob,
- assign_custom_measurable_outputs,
- logprob,
+ _logprob_helper,
)
from pymc.logprob.rewriting import PreserveRVMappings, measurable_ir_rewrites_db
-from pymc.logprob.utils import walk_model
+from pymc.logprob.utils import ignore_logprob, walk_model
class TransformedVariable(Op):
@@ -132,12 +135,12 @@ def backward(
def log_jac_det(self, value: TensorVariable, *inputs) -> TensorVariable:
"""Construct the log of the absolute value of the Jacobian determinant."""
- # jac = at.reshape(
- # gradient(at.sum(self.backward(value, *inputs)), [value]), value.shape
+ # jac = pt.reshape(
+ # gradient(pt.sum(self.backward(value, *inputs)), [value]), value.shape
# )
- # return at.log(at.abs(jac))
+ # return pt.log(pt.abs(jac))
phi_inv = self.backward(value, *inputs)
- return at.log(at.abs(at.nlinalg.det(at.atleast_2d(jacobian(phi_inv, [value])[0]))))
+ return pt.log(pt.abs(pt.nlinalg.det(pt.atleast_2d(jacobian(phi_inv, [value])[0]))))
@node_rewriter(tracks=None)
@@ -237,7 +240,7 @@ def transform_scan_values(fgraph: FunctionGraph, node: Node) -> Optional[List[No
return None
transforms = [
- values_to_transforms.get(rv_map_feature.original_values[value], None)
+ values_to_transforms.get(rv_map_feature.original_values[value_var], None)
for value_var in value_vars
]
@@ -369,20 +372,58 @@ def measurable_transform_logprob(op: MeasurableTransform, values, *inputs, **kwa
# Some transformations, like squaring may produce multiple backward values
if isinstance(backward_value, tuple):
- input_logprob = at.logaddexp(
- *(logprob(measurable_input, backward_val, **kwargs) for backward_val in backward_value)
+ input_logprob = pt.logaddexp(
+ *(
+ _logprob_helper(measurable_input, backward_val, **kwargs)
+ for backward_val in backward_value
+ )
)
else:
- input_logprob = logprob(measurable_input, backward_value)
+ input_logprob = _logprob_helper(measurable_input, backward_value)
+
+ jacobian = op.transform_elemwise.log_jac_det(value, *other_inputs)
if input_logprob.ndim < value.ndim:
- # Do we just need to sum the jacobian terms across the support dims?
- raise NotImplementedError("Transform of multivariate RVs not implemented")
+ # For multivariate variables, the Jacobian is diagonal.
+ # We can get the right result by summing the last dimensions
+ # of `transform_elemwise.log_jac_det`
+ ndim_supp = value.ndim - input_logprob.ndim
+ jacobian = jacobian.sum(axis=tuple(range(-ndim_supp, 0)))
- jacobian = op.transform_elemwise.log_jac_det(value, *other_inputs)
+ # The jacobian is used to ensure a value in the supported domain was provided
+ return pt.switch(pt.isnan(jacobian), -np.inf, input_logprob + jacobian)
+
+
+@_logcdf.register(MeasurableTransform)
+def measurable_transform_logcdf(op: MeasurableTransform, value, *inputs, **kwargs):
+    """Compute the log-CDF graph for a `MeasurableTransform`."""
+ other_inputs = list(inputs)
+ measurable_input = other_inputs.pop(op.measurable_input_idx)
+
+ backward_value = op.transform_elemwise.backward(value, *other_inputs)
+
+ # Some transformations, like squaring may produce multiple backward values
+ if isinstance(backward_value, tuple):
+ raise NotImplementedError
+
+ input_logcdf = _logcdf_helper(measurable_input, backward_value)
# The jacobian is used to ensure a value in the supported domain was provided
- return at.switch(at.isnan(jacobian), -np.inf, input_logprob + jacobian)
+ jacobian = op.transform_elemwise.log_jac_det(value, *other_inputs)
+
+ return pt.switch(pt.isnan(jacobian), -np.inf, input_logcdf)
+
+
+@_icdf.register(MeasurableTransform)
+def measurable_transform_icdf(op: MeasurableTransform, value, *inputs, **kwargs):
+    """Compute the inverse CDF graph for a `MeasurableTransform`."""
+ other_inputs = list(inputs)
+ measurable_input = other_inputs.pop(op.measurable_input_idx)
+
+ input_icdf = _icdf_helper(measurable_input, value)
+ icdf = op.transform_elemwise.forward(input_icdf, *other_inputs)
+
+ return icdf
@node_rewriter([reciprocal])
@@ -400,7 +441,7 @@ def measurable_reciprocal_to_power(fgraph, node):
if inp in rv_map_feature.rv_values:
return None # pragma: no cover
- return [at.pow(inp, -1.0)]
+ return [pt.pow(inp, -1.0)]
@node_rewriter([sqr, sqrt])
@@ -420,10 +461,10 @@ def measurable_sqrt_sqr_to_power(fgraph, node):
return None # pragma: no cover
if isinstance(node.op.scalar_op, Sqr):
- return [at.pow(inp, 2)]
+ return [pt.pow(inp, 2)]
if isinstance(node.op.scalar_op, Sqrt):
- return [at.pow(inp, 1 / 2)]
+ return [pt.pow(inp, 1 / 2)]
@node_rewriter([true_div])
@@ -448,15 +489,15 @@ def measurable_div_to_product(fgraph, node):
# Check if numerator is 1
try:
- if at.get_scalar_constant_value(numerator) == 1:
+ if pt.get_scalar_constant_value(numerator) == 1:
# We convert the denominator directly to a power transform as this
# must be the measurable input
- return [at.pow(denominator, -1)]
+ return [pt.pow(denominator, -1)]
except NotScalarConstantError:
pass
# We don't convert the denominator directly to a power transform as
# it might not be measurable (and therefore not needed)
- return [at.mul(numerator, at.reciprocal(denominator))]
+ return [pt.mul(numerator, pt.reciprocal(denominator))]
@node_rewriter([neg])
@@ -475,7 +516,7 @@ def measurable_neg_to_product(fgraph, node):
if inp in rv_map_feature.rv_values:
return None # pragma: no cover
- return [at.mul(inp, -1.0)]
+ return [pt.mul(inp, -1.0)]
@node_rewriter([sub])
@@ -496,7 +537,7 @@ def measurable_sub_to_neg(fgraph, node):
return None # pragma: no cover
minuend, subtrahend = node.inputs
- return [at.add(minuend, at.neg(subtrahend))]
+ return [pt.add(minuend, pt.neg(subtrahend))]
@node_rewriter([exp, log, add, mul, pow, abs])
@@ -549,7 +590,7 @@ def find_measurable_transforms(fgraph: FunctionGraph, node: Node) -> Optional[Li
# Make base_measure outputs unmeasurable
# This seems to be the only thing preventing nested rewrites from being erased
- measurable_input = assign_custom_measurable_outputs(measurable_input.owner)
+ measurable_input = ignore_logprob(measurable_input)
scalar_op = node.op.scalar_op
measurable_input_idx = 0
@@ -567,19 +608,19 @@ def find_measurable_transforms(fgraph: FunctionGraph, node: Node) -> Optional[Li
return None
try:
(power,) = other_inputs
- power = at.get_scalar_constant_value(power).item()
+ power = pt.get_underlying_scalar_constant_value(power).item()
# Power needs to be a constant
except NotScalarConstantError:
return None
transform_inputs = (measurable_input, power)
transform = PowerTransform(power=power)
elif isinstance(scalar_op, Add):
- transform_inputs = (measurable_input, at.add(*other_inputs))
+ transform_inputs = (measurable_input, pt.add(*other_inputs))
transform = LocTransform(
transform_args_fn=lambda *inputs: inputs[-1],
)
else:
- transform_inputs = (measurable_input, at.mul(*other_inputs))
+ transform_inputs = (measurable_input, pt.mul(*other_inputs))
transform = ScaleTransform(
transform_args_fn=lambda *inputs: inputs[-1],
)
@@ -656,7 +697,7 @@ def backward(self, value, *inputs):
return value - loc
def log_jac_det(self, value, *inputs):
- return at.zeros_like(value)
+ return pt.zeros_like(value)
class ScaleTransform(RVTransform):
@@ -675,17 +716,17 @@ def backward(self, value, *inputs):
def log_jac_det(self, value, *inputs):
scale = self.transform_args_fn(*inputs)
- return -at.log(at.abs(scale))
+ return -pt.log(pt.abs(pt.broadcast_to(scale, value.shape)))
class LogTransform(RVTransform):
name = "log"
def forward(self, value, *inputs):
- return at.log(value)
+ return pt.log(value)
def backward(self, value, *inputs):
- return at.exp(value)
+ return pt.exp(value)
def log_jac_det(self, value, *inputs):
return value
@@ -695,27 +736,27 @@ class ExpTransform(RVTransform):
name = "exp"
def forward(self, value, *inputs):
- return at.exp(value)
+ return pt.exp(value)
def backward(self, value, *inputs):
- return at.log(value)
+ return pt.log(value)
def log_jac_det(self, value, *inputs):
- return -at.log(value)
+ return -pt.log(value)
class AbsTransform(RVTransform):
name = "abs"
def forward(self, value, *inputs):
- return at.abs(value)
+ return pt.abs(value)
def backward(self, value, *inputs):
- value = at.switch(value >= 0, value, np.nan)
+ value = pt.switch(value >= 0, value, np.nan)
return -value, value
def log_jac_det(self, value, *inputs):
- return at.switch(value >= 0, 0, np.nan)
+ return pt.switch(value >= 0, 0, np.nan)
class PowerTransform(RVTransform):
@@ -730,17 +771,17 @@ def __init__(self, power=None):
super().__init__()
def forward(self, value, *inputs):
- at.power(value, self.power)
+        return pt.power(value, self.power)
def backward(self, value, *inputs):
inv_power = 1 / self.power
# Powers that don't admit negative values
if (np.abs(self.power) < 1) or (self.power % 2 == 0):
- backward_value = at.switch(value >= 0, at.power(value, inv_power), np.nan)
+ backward_value = pt.switch(value >= 0, pt.power(value, inv_power), np.nan)
# Powers that admit negative values require special logic, because (-1)**(1/3) returns `nan` in PyTensor
else:
- backward_value = at.power(at.abs(value), inv_power) * at.switch(value >= 0, 1, -1)
+ backward_value = pt.power(pt.abs(value), inv_power) * pt.switch(value >= 0, 1, -1)
# In this case the transform is not 1-to-1
if self.power % 2 == 0:
@@ -752,11 +793,11 @@ def log_jac_det(self, value, *inputs):
inv_power = 1 / self.power
# Note: This fails for value==0
- res = np.log(np.abs(inv_power)) + (inv_power - 1) * at.log(at.abs(value))
+ res = np.log(np.abs(inv_power)) + (inv_power - 1) * pt.log(pt.abs(value))
# Powers that don't admit negative values
if (np.abs(self.power) < 1) or (self.power % 2 == 0):
- res = at.switch(value >= 0, res, np.nan)
+ res = pt.switch(value >= 0, res, np.nan)
return res
@@ -780,11 +821,11 @@ def forward(self, value, *inputs):
a, b = self.args_fn(*inputs)
if a is not None and b is not None:
- return at.log(value - a) - at.log(b - value)
+ return pt.log(value - a) - pt.log(b - value)
elif a is not None:
- return at.log(value - a)
+ return pt.log(value - a)
elif b is not None:
- return at.log(b - value)
+ return pt.log(b - value)
else:
raise ValueError("Both edges of IntervalTransform cannot be None")
@@ -792,12 +833,12 @@ def backward(self, value, *inputs):
a, b = self.args_fn(*inputs)
if a is not None and b is not None:
- sigmoid_x = at.sigmoid(value)
+ sigmoid_x = pt.sigmoid(value)
return sigmoid_x * b + (1 - sigmoid_x) * a
elif a is not None:
- return at.exp(value) + a
+ return pt.exp(value) + a
elif b is not None:
- return b - at.exp(value)
+ return b - pt.exp(value)
else:
raise ValueError("Both edges of IntervalTransform cannot be None")
@@ -805,8 +846,8 @@ def log_jac_det(self, value, *inputs):
a, b = self.args_fn(*inputs)
if a is not None and b is not None:
- s = at.softplus(-value)
- return at.log(b - a) - 2 * s - value
+ s = pt.softplus(-value)
+ return pt.log(b - a) - 2 * s - value
elif a is None and b is None:
raise ValueError("Both edges of IntervalTransform cannot be None")
else:
@@ -817,50 +858,50 @@ class LogOddsTransform(RVTransform):
name = "logodds"
def backward(self, value, *inputs):
- return at.expit(value)
+ return pt.expit(value)
def forward(self, value, *inputs):
- return at.log(value / (1 - value))
+ return pt.log(value / (1 - value))
def log_jac_det(self, value, *inputs):
- sigmoid_value = at.sigmoid(value)
- return at.log(sigmoid_value) + at.log1p(-sigmoid_value)
+ sigmoid_value = pt.sigmoid(value)
+ return pt.log(sigmoid_value) + pt.log1p(-sigmoid_value)
class SimplexTransform(RVTransform):
name = "simplex"
def forward(self, value, *inputs):
- log_value = at.log(value)
- shift = at.sum(log_value, -1, keepdims=True) / value.shape[-1]
+ log_value = pt.log(value)
+ shift = pt.sum(log_value, -1, keepdims=True) / value.shape[-1]
return log_value[..., :-1] - shift
def backward(self, value, *inputs):
- value = at.concatenate([value, -at.sum(value, -1, keepdims=True)], axis=-1)
- exp_value_max = at.exp(value - at.max(value, -1, keepdims=True))
- return exp_value_max / at.sum(exp_value_max, -1, keepdims=True)
+ value = pt.concatenate([value, -pt.sum(value, -1, keepdims=True)], axis=-1)
+ exp_value_max = pt.exp(value - pt.max(value, -1, keepdims=True))
+ return exp_value_max / pt.sum(exp_value_max, -1, keepdims=True)
def log_jac_det(self, value, *inputs):
N = value.shape[-1] + 1
- sum_value = at.sum(value, -1, keepdims=True)
+ sum_value = pt.sum(value, -1, keepdims=True)
value_sum_expanded = value + sum_value
- value_sum_expanded = at.concatenate([value_sum_expanded, at.zeros(sum_value.shape)], -1)
- logsumexp_value_expanded = at.logsumexp(value_sum_expanded, -1, keepdims=True)
- res = at.log(N) + (N * sum_value) - (N * logsumexp_value_expanded)
- return at.sum(res, -1)
+ value_sum_expanded = pt.concatenate([value_sum_expanded, pt.zeros(sum_value.shape)], -1)
+ logsumexp_value_expanded = pt.logsumexp(value_sum_expanded, -1, keepdims=True)
+ res = pt.log(N) + (N * sum_value) - (N * logsumexp_value_expanded)
+ return pt.sum(res, -1)
class CircularTransform(RVTransform):
name = "circular"
def backward(self, value, *inputs):
- return at.arctan2(at.sin(value), at.cos(value))
+ return pt.arctan2(pt.sin(value), pt.cos(value))
def forward(self, value, *inputs):
- return at.as_tensor_variable(value)
+ return pt.as_tensor_variable(value)
def log_jac_det(self, value, *inputs):
- return at.zeros(value.shape)
+ return pt.zeros(value.shape)
class ChainedTransform(RVTransform):
@@ -881,7 +922,7 @@ def backward(self, value, *inputs):
return value
def log_jac_det(self, value, *inputs):
- value = at.as_tensor_variable(value)
+ value = pt.as_tensor_variable(value)
det_list = []
ndim0 = value.ndim
for transform in reversed(self.transform_list):
@@ -893,7 +934,8 @@ def log_jac_det(self, value, *inputs):
det = 0.0
for det_ in det_list:
if det_.ndim > ndim0:
- det += det_.sum(axis=-1)
+ ndim_diff = det_.ndim - ndim0
+ det += det_.sum(axis=tuple(range(-ndim_diff, 0)))
else:
det += det_
return det
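A rough sketch of the new `logcdf` dispatch for `MeasurableTransform` (illustrative only, assuming `pm.logcdf` falls back to IR rewriting like the helpers in this change set): for a monotonic transform, the log-CDF is the base variable's log-CDF evaluated at the backward-transformed value.

.. code-block:: python

    import numpy as np
    import pymc as pm
    import pytensor.tensor as pt

    x = pt.exp(pm.Normal.dist(0, 1))  # a standard lognormal
    x_vv = pt.scalar("x_vv")
    x_logcdf = pm.logcdf(x, x_vv)

    np.exp(x_logcdf.eval({x_vv: 1.0}))  # CDF at the lognormal median, about 0.5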
diff --git a/pymc/logprob/utils.py b/pymc/logprob/utils.py
index b88d56d3ee..18f9b803e7 100644
--- a/pymc/logprob/utils.py
+++ b/pymc/logprob/utils.py
@@ -37,7 +37,17 @@
import warnings
from copy import copy
-from typing import Callable, Dict, Generator, Iterable, List, Optional, Set, Tuple
+from typing import (
+ Callable,
+ Dict,
+ Generator,
+ Iterable,
+ List,
+ Optional,
+ Sequence,
+ Set,
+ Tuple,
+)
import numpy as np
@@ -210,8 +220,11 @@ class CheckParameterValue(CheckAndRaise):
Raises `ParameterValueError` if the check is not True.
"""
- def __init__(self, msg=""):
+ __props__ = ("msg", "exc_type", "can_be_replaced_by_ninf")
+
+ def __init__(self, msg: str = "", can_be_replaced_by_ninf: bool = False):
super().__init__(ParameterValueError, msg)
+ self.can_be_replaced_by_ninf = can_be_replaced_by_ninf
def __str__(self):
return f"Check{{{self.msg}}}"
@@ -262,7 +275,7 @@ def diracdelta_logprob(op, values, *inputs, **kwargs):
def ignore_logprob(rv: TensorVariable) -> TensorVariable:
"""Return a duplicated variable that is ignored when creating logprob graphs
- This is used in SymbolicDistributions that use other RVs as inputs but account
+    This is used by MeasurableRVs that use other RVs as inputs but account
for their logp terms explicitly.
If the variable is already ignored, it is returned directly.
@@ -272,6 +285,7 @@ def ignore_logprob(rv: TensorVariable) -> TensorVariable:
op_type = type(node.op)
if op_type.__name__.startswith(prefix):
return rv
+ # By default `assign_custom_measurable_outputs` makes all outputs unmeasurable
new_node = assign_custom_measurable_outputs(node, type_prefix=prefix)
return new_node.outputs[node.outputs.index(rv)]
@@ -294,3 +308,32 @@ def reconsider_logprob(rv: TensorVariable) -> TensorVariable:
new_node.op = copy(new_node.op)
new_node.op.__class__ = original_op_type
return new_node.outputs[node.outputs.index(rv)]
+
+
+def ignore_logprob_multiple_vars(
+ vars: Sequence[TensorVariable], rvs_to_values: Dict[TensorVariable, TensorVariable]
+) -> List[TensorVariable]:
+ """Return duplicated variables that are ignored when creating logprob graphs.
+
+ This function keeps any interdependencies between variables intact, after
+ making each "unmeasurable", whereas a sequential call to `ignore_logprob`
+ would not do this correctly.
+ """
+ from pymc.pytensorf import _replace_rvs_in_graphs
+
+ measurable_vars_to_unmeasurable_vars = {
+ measurable_var: ignore_logprob(measurable_var) for measurable_var in vars
+ }
+
+ def replacement_fn(var, replacements):
+ if var in measurable_vars_to_unmeasurable_vars:
+ replacements[var] = measurable_vars_to_unmeasurable_vars[var]
+ # We don't want to clone valued nodes. Assigning a var to itself in the
+ # replacements prevents this
+ elif var in rvs_to_values:
+ replacements[var] = var
+
+ return []
+
+ unmeasurable_vars, _ = _replace_rvs_in_graphs(graphs=vars, replacement_fn=replacement_fn)
+ return unmeasurable_vars
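A small sketch of the intended semantics of `ignore_logprob_multiple_vars` (illustrative, not taken from the patch): interdependent RVs are cloned together, so the clone of the dependent variable is parametrized by the clone of its parent rather than by the original.

.. code-block:: python

    import pymc as pm

    from pymc.logprob.utils import ignore_logprob_multiple_vars

    x = pm.Normal.dist(0, 1)
    y = pm.Normal.dist(x, 1)

    # Neither clone contributes its own logp term, but the clone of `y`
    # still depends on the clone of `x`, not on the original `x`
    new_x, new_y = ignore_logprob_multiple_vars([x, y], rvs_to_values={})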
diff --git a/pymc/math.py b/pymc/math.py
index 9666323bd1..688186c51c 100644
--- a/pymc/math.py
+++ b/pymc/math.py
@@ -20,7 +20,7 @@
import numpy as np
import pytensor
import pytensor.sparse
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytensor.tensor.slinalg # pylint: disable=unused-import
import scipy as sp
import scipy.sparse # pylint: disable=unused-import
@@ -171,7 +171,7 @@ def kronecker(*Ks):
np.ndarray :
Block matrix Kroncker product of the argument matrices.
"""
- return reduce(at.slinalg.kron, Ks)
+ return reduce(pt.slinalg.kron, Ks)
def cartesian(*arrays):
@@ -224,17 +224,17 @@ def kron_vector_op(v):
raise ValueError(f"m must have ndim <= 2, not {m.ndim}")
result = kron_vector_op(m)
result_shape = result.shape
- return at.reshape(result, (result_shape[1], result_shape[0])).T
+ return pt.reshape(result, (result_shape[1], result_shape[0])).T
# Define kronecker functions that work on 1D and 2D arrays
-kron_dot = partial(kron_matrix_op, op=at.dot)
-kron_solve_lower = partial(kron_matrix_op, op=at.slinalg.SolveTriangular(lower=True))
-kron_solve_upper = partial(kron_matrix_op, op=at.slinalg.SolveTriangular(lower=False))
+kron_dot = partial(kron_matrix_op, op=pt.dot)
+kron_solve_lower = partial(kron_matrix_op, op=pt.slinalg.SolveTriangular(lower=True))
+kron_solve_upper = partial(kron_matrix_op, op=pt.slinalg.SolveTriangular(lower=False))
def flat_outer(a, b):
- return at.outer(a, b).ravel()
+ return pt.outer(a, b).ravel()
def kron_diag(*diags):
@@ -254,12 +254,12 @@ def tround(*args, **kwargs):
when the warning disappears.
"""
kwargs["mode"] = "half_to_even"
- return at.round(*args, **kwargs)
+ return pt.round(*args, **kwargs)
def logdiffexp(a, b):
"""log(exp(a) - exp(b))"""
- return a + at.log1mexp(b - a)
+ return a + pt.log1mexp(b - a)
def logdiffexp_numpy(a, b):
@@ -275,7 +275,7 @@ def invlogit(x, eps=None):
FutureWarning,
stacklevel=2,
)
- return at.sigmoid(x)
+ return pt.sigmoid(x)
def softmax(x, axis=None):
@@ -283,7 +283,7 @@ def softmax(x, axis=None):
# drops that warning
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
- return at.special.softmax(x, axis=axis)
+ return pt.special.softmax(x, axis=axis)
def log_softmax(x, axis=None):
@@ -291,7 +291,7 @@ def log_softmax(x, axis=None):
# drops that warning
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
- return at.special.log_softmax(x, axis=axis)
+ return pt.special.log_softmax(x, axis=axis)
def logbern(log_p):
@@ -301,7 +301,7 @@ def logbern(log_p):
def logit(p):
- return at.log(p / (floatX(1) - p))
+ return pt.log(p / (floatX(1) - p))
def log1mexp(x, *, negative_input=False):
@@ -327,7 +327,7 @@ def log1mexp(x, *, negative_input=False):
)
x = -x
- return at.log1mexp(x)
+ return pt.log1mexp(x)
def log1mexp_numpy(x, *, negative_input=False):
@@ -356,7 +356,7 @@ def log1mexp_numpy(x, *, negative_input=False):
def flatten_list(tensors):
- return at.concatenate([var.ravel() for var in tensors])
+ return pt.concatenate([var.ravel() for var in tensors])
class LogDet(Op):
@@ -441,13 +441,13 @@ def expand_packed_triangular(n, packed, lower=True, diagonal_only=False):
diag_idxs = np.arange(2, n + 2)[::-1].cumsum() - n - 1
return packed[diag_idxs]
elif lower:
- out = at.zeros((n, n), dtype=pytensor.config.floatX)
+ out = pt.zeros((n, n), dtype=pytensor.config.floatX)
idxs = np.tril_indices(n)
- return at.set_subtensor(out[idxs], packed)
+ return pt.set_subtensor(out[idxs], packed)
elif not lower:
- out = at.zeros((n, n), dtype=pytensor.config.floatX)
+ out = pt.zeros((n, n), dtype=pytensor.config.floatX)
idxs = np.triu_indices(n)
- return at.set_subtensor(out[idxs], packed)
+ return pt.set_subtensor(out[idxs], packed)
class BatchedDiag(Op):
@@ -458,11 +458,11 @@ class BatchedDiag(Op):
__props__ = ()
def make_node(self, diag):
- diag = at.as_tensor_variable(diag)
+ diag = pt.as_tensor_variable(diag)
if diag.type.ndim != 2:
raise TypeError("data argument must be a matrix", diag.type)
- return Apply(self, [diag], [at.tensor3(dtype=diag.dtype)])
+ return Apply(self, [diag], [pt.tensor3(dtype=diag.dtype)])
def perform(self, node, ins, outs, params=None):
(C,) = ins
@@ -478,7 +478,7 @@ def perform(self, node, ins, outs, params=None):
def grad(self, inputs, gout):
(gz,) = gout
- idx = at.arange(gz.shape[-1])
+ idx = pt.arange(gz.shape[-1])
return [gz[..., idx, idx]]
def infer_shape(self, fgraph, nodes, shapes):
@@ -486,14 +486,14 @@ def infer_shape(self, fgraph, nodes, shapes):
def batched_diag(C):
- C = at.as_tensor(C)
+ C = pt.as_tensor(C)
dim = C.shape[-1]
if C.ndim == 2:
# diag -> matrices
return BatchedDiag()(C)
elif C.ndim == 3:
# matrices -> diag
- idx = at.arange(dim)
+ idx = pt.arange(dim)
return C[..., idx, idx]
else:
raise ValueError("Input should be 2 or 3 dimensional")
@@ -511,7 +511,7 @@ def __init__(self, sparse=False, format="csr"):
def make_node(self, *matrices):
if not matrices:
raise ValueError("no matrices to allocate")
- matrices = list(map(at.as_tensor, matrices))
+ matrices = list(map(pt.as_tensor, matrices))
if any(mat.type.ndim != 2 for mat in matrices):
raise TypeError("all data arguments must be matrices")
if self.sparse:
@@ -528,13 +528,13 @@ def perform(self, node, inputs, output_storage, params=None):
output_storage[0][0] = scipy_block_diag(*inputs).astype(dtype)
def grad(self, inputs, gout):
- shapes = at.stack([i.shape for i in inputs])
+ shapes = pt.stack([i.shape for i in inputs])
index_end = shapes.cumsum(0)
index_begin = index_end - shapes
slices = [
ix_(
- at.arange(index_begin[i, 0], index_end[i, 0]),
- at.arange(index_begin[i, 1], index_end[i, 1]),
+ pt.arange(index_begin[i, 0], index_end[i, 0]),
+ pt.arange(index_begin[i, 1], index_end[i, 1]),
)
for i in range(len(inputs))
]
@@ -542,7 +542,7 @@ def grad(self, inputs, gout):
def infer_shape(self, fgraph, nodes, shapes):
first, second = zip(*shapes)
- return [(at.add(*first), at.add(*second))]
+ return [(pt.add(*first), pt.add(*second))]
def block_diagonal(matrices, sparse=False, format="csr"):
diff --git a/pymc/model.py b/pymc/model.py
index a9e55ea031..90311da95f 100644
--- a/pymc/model.py
+++ b/pymc/model.py
@@ -13,6 +13,7 @@
# limitations under the License.
import functools
+import sys
import threading
import types
import warnings
@@ -24,6 +25,7 @@
Callable,
Dict,
List,
+ Literal,
Optional,
Sequence,
Tuple,
@@ -36,9 +38,10 @@
import numpy as np
import pytensor
import pytensor.sparse as sparse
-import pytensor.tensor as at
+import pytensor.tensor as pt
import scipy.sparse as sps
+from pytensor.compile import DeepCopyOp, get_mode
from pytensor.compile.sharedvalue import SharedVariable
from pytensor.graph.basic import Constant, Variable, graph_inputs
from pytensor.graph.fg import FunctionGraph
@@ -46,6 +49,7 @@
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.random.op import RandomVariable
from pytensor.tensor.random.rewriting import local_subtensor_rv_lift
+from pytensor.tensor.random.type import RandomType
from pytensor.tensor.sharedvar import ScalarSharedVariable
from pytensor.tensor.var import TensorConstant, TensorVariable
@@ -60,7 +64,8 @@
ShapeWarning,
)
from pymc.initial_point import make_initial_point_fn
-from pymc.logprob.joint_logprob import joint_logp
+from pymc.logprob.basic import joint_logp
+from pymc.logprob.utils import ParameterValueError
from pymc.pytensorf import (
PointFunc,
SeedSequenceSeed,
@@ -491,7 +496,7 @@ def __init__(self, mean=0, sigma=1, name=''):
Deterministic('v3_sq', self.v3 ** 2)
# Potentials too
- Potential('p1', at.constant(1))
+ Potential('p1', pt.constant(1))
# After defining a class CustomModel you can use it in several
# ways
@@ -776,7 +781,7 @@ def logp(
if not sum:
return logp_factors
- logp_scalar = at.sum([at.sum(factor) for factor in logp_factors])
+ logp_scalar = pt.sum([pt.sum(factor) for factor in logp_factors])
logp_scalar_name = "__logp" if jacobian else "__logp_nojac"
if self.name:
logp_scalar_name = f"{logp_scalar_name}_{self.name}"
@@ -889,9 +894,9 @@ def potentiallogp(self) -> Variable:
# inputs and apply their transforms, if any
potentials = self.replace_rvs_by_values(self.potentials)
if potentials:
- return at.sum([at.sum(factor) for factor in potentials])
+ return pt.sum([pt.sum(factor) for factor in potentials])
else:
- return at.constant(0.0)
+ return pt.constant(0.0)
@property
def value_vars(self):
@@ -1438,7 +1443,7 @@ def make_obs_var(
# values, and another for the non-missing values.
antimask_idx = (~mask).nonzero()
- nonmissing_data = at.as_tensor_variable(data[antimask_idx])
+ nonmissing_data = pt.as_tensor_variable(data[antimask_idx].data)
unmasked_rv_var = rv_var[antimask_idx]
unmasked_rv_var = unmasked_rv_var.owner.clone().default_output()
@@ -1461,16 +1466,16 @@ def make_obs_var(
# Create deterministic that combines observed and missing
# Note: This can widely increase memory consumption during sampling for large datasets
- rv_var = at.empty(data.shape, dtype=observed_rv_var.type.dtype)
- rv_var = at.set_subtensor(rv_var[mask.nonzero()], missing_rv_var)
- rv_var = at.set_subtensor(rv_var[antimask_idx], observed_rv_var)
+ rv_var = pt.empty(data.shape, dtype=observed_rv_var.type.dtype)
+ rv_var = pt.set_subtensor(rv_var[mask.nonzero()], missing_rv_var)
+ rv_var = pt.set_subtensor(rv_var[antimask_idx], observed_rv_var)
rv_var = Deterministic(name, rv_var, self, dims)
else:
if sps.issparse(data):
data = sparse.basic.as_sparse(data, name=name)
else:
- data = at.as_tensor_variable(data, name=name)
+ data = pt.as_tensor_variable(data, name=name)
if total_size:
from pymc.variational.minibatch_rv import create_minibatch_rv
@@ -1779,7 +1784,8 @@ def check_start_vals(self, start):
raise SamplingError(
"Initial evaluation of model at starting point failed!\n"
f"Starting values:\n{elem}\n\n"
- f"Initial evaluation results:\n{initial_eval}"
+ f"Logp initial evaluation results:\n{initial_eval}\n"
+ "You can call `model.debug()` for more details."
)
def point_logps(self, point=None, round_vals=2):
@@ -1802,7 +1808,7 @@ def point_logps(self, point=None, round_vals=2):
point = self.initial_point()
factors = self.basic_RVs + self.potentials
- factor_logps_fn = [at.sum(factor) for factor in self.logp(factors, sum=False)]
+ factor_logps_fn = [pt.sum(factor) for factor in self.logp(factors, sum=False)]
return {
factor.name: np.round(np.asarray(factor_logp), round_vals)
for factor, factor_logp in zip(
@@ -1811,6 +1817,152 @@ def point_logps(self, point=None, round_vals=2):
)
}
+ def debug(
+ self,
+ point: Optional[Dict[str, np.ndarray]] = None,
+ fn: Literal["logp", "dlogp", "random"] = "logp",
+ verbose: bool = False,
+ ):
+ """Debug model function at point.
+
+ The method evaluates `fn` for one variable at a time.
+ When an evaluation fails or produces a non-finite value, we print:
+ 1. The graph of the parameters
+ 2. The value of the parameters (if those can be evaluated)
+ 3. The output of `fn` (if it can be evaluated)
+
+ This function should help to quickly narrow down invalid parametrizations.
+
+ Parameters
+ ----------
+ point : Point
+ Point at which model function should be evaluated
+ fn : str, default "logp"
+ Function to be used for debugging. Can be one of [logp, dlogp, random].
+ verbose : bool, default False
+ Whether to show a more verbose PyTensor output when the function cannot be evaluated
+ """
+ print_ = functools.partial(print, file=sys.stdout)
+
+ def first_line(exc):
+ return exc.args[0].split("\n")[0]
+
+ def debug_parameters(rv):
+ if isinstance(rv.owner.op, RandomVariable):
+ inputs = rv.owner.inputs[3:]
+ else:
+ inputs = [inp for inp in rv.owner.inputs if not isinstance(inp.type, RandomType)]
+ rv_inputs = pytensor.function(
+ self.value_vars,
+ self.replace_rvs_by_values(inputs),
+ on_unused_input="ignore",
+ mode=get_mode(None).excluding("inplace", "fusion"),
+ )
+
+ print_(f"The variable {rv} has the following parameters:")
+ # done and used_ids are used to keep the same ids across distinct dprint calls
+ done = {}
+ used_ids = {}
+ for i, out in enumerate(rv_inputs.maker.fgraph.outputs):
+ print_(f"{i}: ", end=""),
+ # Don't print useless deepcopys
+ if out.owner and isinstance(out.owner.op, DeepCopyOp):
+ out = out.owner.inputs[0]
+ pytensor.dprint(out, print_type=True, done=done, used_ids=used_ids)
+
+ try:
+ print_("The parameters evaluate to:")
+ for i, rv_input_eval in enumerate(rv_inputs(**point)):
+ print_(f"{i}: {rv_input_eval}")
+ except Exception as exc:
+ print_(
+ f"The parameters of the variable {rv} cannot be evaluated: {first_line(exc)}"
+ )
+ if verbose:
+ print_(exc, "\n")
+
+ if fn not in ("logp", "dlogp", "random"):
+ raise ValueError(f"fn must be one of [logp, dlogp, random], got {fn}")
+
+ if point is None:
+ point = self.initial_point()
+ print_(f"point={point}\n")
+
+ rvs_to_check = list(self.basic_RVs)
+ if fn in ("logp", "dlogp"):
+ rvs_to_check += [self.replace_rvs_by_values(p) for p in self.potentials]
+
+ found_problem = False
+ for rv in rvs_to_check:
+ if fn == "logp":
+ rv_fn = pytensor.function(
+ self.value_vars, self.logp(vars=rv, sum=False)[0], on_unused_input="ignore"
+ )
+ elif fn == "dlogp":
+ rv_fn = pytensor.function(
+ self.value_vars, self.dlogp(vars=rv), on_unused_input="ignore"
+ )
+ else:
+ [rv_inputs_replaced] = replace_rvs_by_values(
+ [rv],
+ # Don't include itself, or the function will just return the value variable
+ rvs_to_values={
+ rv_key: value
+ for rv_key, value in self.rvs_to_values.items()
+ if rv_key is not rv
+ },
+ rvs_to_transforms=self.rvs_to_transforms,
+ )
+ rv_fn = pytensor.function(
+ self.value_vars, rv_inputs_replaced, on_unused_input="ignore"
+ )
+
+ try:
+ rv_fn_eval = rv_fn(**point)
+ except ParameterValueError as exc:
+ found_problem = True
+ debug_parameters(rv)
+ print_(
+ f"This does not respect one of the following constraints: {first_line(exc)}\n"
+ )
+ if verbose:
+ print_(exc)
+ except Exception as exc:
+ found_problem = True
+ debug_parameters(rv)
+ print_(
+ f"The variable {rv} {fn} method raised the following exception: {first_line(exc)}\n"
+ )
+ if verbose:
+ print_(exc)
+ else:
+ if not np.all(np.isfinite(rv_fn_eval)):
+ found_problem = True
+ debug_parameters(rv)
+ if fn == "random" or rv is self.potentials:
+ print_("This combination seems able to generate non-finite values")
+ else:
+ # Find which values are associated with non-finite evaluation
+ values = self.rvs_to_values[rv]
+ if rv in self.observed_RVs:
+ values = values.eval()
+ else:
+ values = point[values.name]
+
+ observed = " observed " if rv in self.observed_RVs else " "
+ print_(
+ f"Some of the{observed}values of variable {rv} are associated with a non-finite {fn}:"
+ )
+ mask = ~np.isfinite(rv_fn_eval)
+ for value, fn_eval in zip(values[mask], rv_fn_eval[mask]):
+ print_(f" value = {value} -> {fn} = {fn_eval}")
+ print_()
+
+ if not found_problem:
+ print_("No problems found")
+ elif not verbose:
+ print_("You can set `verbose=True` for more details")
+
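A hedged sketch of how the new `Model.debug` helper might be used; the model and its invalid parametrization below are hypothetical.

```python
import pymc as pm

# A deliberately mis-parametrized model: at the initial point sigma is 0,
# which violates the sigma > 0 constraint of the second Normal.
with pm.Model() as m:
    sigma = pm.Normal("sigma")
    x = pm.Normal("x", mu=0, sigma=sigma)

m.debug()                            # reports which parameters break constraints
m.debug(fn="random", verbose=True)   # same idea for forward sampling
```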
# this is really disgusting, but it breaks a self-loop: I can't pass Model
# itself as context class init arg.
@@ -1967,6 +2119,25 @@ def Deterministic(name, var, model=None, dims=None):
they don't add randomness to the model. They are generally used to record
an intermediate result.
+ Parameters
+ ----------
+ name : str
+ Name of the deterministic variable to be registered in the model.
+ var : tensor_like
+ Expression for the calculation of the variable.
+ model : Model, optional
+ The model object to which the Deterministic variable is added.
+ If ``None`` is provided, the current model in the context stack is used.
+ dims : str or tuple of str, optional
+ Dimension names for the variable.
+
+ Returns
+ -------
+ var : tensor_like
+ The registered, named variable wrapped in Deterministic.
+
+ Examples
+ --------
Indeed, PyMC allows for arbitrary combinations of random variables, for
example in the case of a logistic regression
@@ -2007,19 +2178,6 @@ def Deterministic(name, var, model=None, dims=None):
of times during a NUTS step, the Deterministic quantities are just
computed once at the end of the step, with the final values of the other
random variables.
-
- Parameters
- ----------
- name: str
- var: PyTensor variables
- auto: bool
- Add automatically created deterministics (e.g., when imputing missing values)
- to a separate model.auto_deterministics list for filtering during sampling.
-
-
- Returns
- -------
- var: var, with name attribute
"""
model = modelcontext(model)
var = var.copy(model.name_for(name))
@@ -2041,7 +2199,7 @@ def Deterministic(name, var, model=None, dims=None):
return var
-def Potential(name, var, model=None):
+def Potential(name, var, model=None, dims=None):
"""
Add an arbitrary factor potential to the model likelihood
@@ -2049,13 +2207,7 @@ def Potential(name, var, model=None):
Warnings
--------
- Potential functions only influence logp based sampling, like the one used by ``pm.sample``.
- Potentials, modify the log-probability of the model by adding a contribution to the logp which is used by sampling algorithms which rely on the information about the observed data to generate posterior samples.
- Potentials are not applicable in the context of forward sampling because they don't affect the prior distribution itself, only the computation of the logp.
- Forward sampling algorithms generate sample points from the prior distribution of the model, without taking into account the likelihood function.
- In other words, it does not use the information about the observed data.
- Hence, Potentials do not affect forward sampling, which is used by ``sample_prior_predictive`` and ``sample_posterior_predictive``.
- A warning saying "The effect of Potentials on other parameters is ignored during prior predictive sampling" is always emitted to alert user of this.
+ Potential functions only influence logp-based sampling. Therefore, they are applicable for sampling with ``pm.sample`` but not ``pm.sample_prior_predictive`` or ``pm.sample_posterior_predictive``.
Parameters
----------
@@ -2065,7 +2217,9 @@ def Potential(name, var, model=None):
Expression to be added to the model joint logp.
model : Model, optional
The model object to which the potential function is added.
- If ``None`` is provided, the current model is used.
+ If ``None`` is provided, the current model in the context stack is used.
+ dims : str or tuple of str, optional
+ Dimension names for the variable.
Returns
-------
@@ -2077,7 +2231,7 @@ def Potential(name, var, model=None):
Have a look at the following example:
In this example, we define a constraint on ``x`` to be greater than or equal to 0 via the ``pm.Potential`` function.
- We pass ``-pm.math.log(pm.math.switch(constraint, 1, 0))`` as second argument which will return an expression depending on if the constraint is met or not and which will be added to the likelihood of the model.
+ We pass ``pm.math.log(pm.math.switch(constraint, 1, 0))`` as the second argument, which returns an expression whose value depends on whether the constraint is met, and which is added to the likelihood of the model.
The probability density that this model produces agrees strongly with the constraint that ``x`` should be greater than or equal to 0. All the cases that do not satisfy the constraint are strictly excluded.
.. code:: python
@@ -2086,9 +2240,9 @@ def Potential(name, var, model=None):
x = pm.Normal("x", mu=0, sigma=1)
y = pm.Normal("y", mu=x, sigma=1, observed=data)
constraint = x >= 0
- potential = pm.Potential("x_constraint", pm.math.log(pm.math.switch(constraint, 1, 0.0)))
+ potential = pm.Potential("x_constraint", pm.math.log(pm.math.switch(constraint, 1, 0)))
- However, if we use ``-pm.math.log(pm.math.switch(constraint, 1, 0.5))`` the potential again penalizes the likelihood when constraint is not met but with some deviations allowed.
+ However, if we use ``pm.math.log(pm.math.switch(constraint, 1.0, 0.5))``, the potential again penalizes the likelihood when the constraint is not met, but with some deviations allowed.
Here, the Potential function is used to impose a soft constraint.
A soft constraint is a constraint that does not have to be satisfied exactly.
The effect of this is that the posterior probability for the parameters decreases as they move away from the constraint, but does not become exactly zero.
@@ -2100,7 +2254,7 @@ def Potential(name, var, model=None):
x = pm.Normal("x", mu=0.1, sigma=1)
y = pm.Normal("y", mu=x, sigma=1, observed=data)
constraint = x >= 0
- potential = pm.Potential("x_constraint", pm.math.log(pm.math.switch(constraint, 1, 0.5)))
+ potential = pm.Potential("x_constraint", pm.math.log(pm.math.switch(constraint, 1.0, 0.5)))
In this example, Potential is used to obtain an arbitrary prior.
This prior distribution encodes the knowledge that the values of ``max_items`` are likely to be small rather than large.
@@ -2135,7 +2289,7 @@ def Potential(name, var, model=None):
model = modelcontext(model)
var.name = model.name_for(name)
model.potentials.append(var)
- model.add_named_variable(var)
+ model.add_named_variable(var, dims)
from pymc.printing import str_for_potential_or_deterministic
diff --git a/pymc/ode/ode.py b/pymc/ode/ode.py
index db1eefc8da..d92ad2246c 100644
--- a/pymc/ode/ode.py
+++ b/pymc/ode/ode.py
@@ -16,7 +16,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import scipy
from pytensor.graph.basic import Apply
@@ -156,8 +156,8 @@ def __call__(self, y0, theta, return_sens=False, **kwargs):
)
# convert inputs to tensors (and check their types)
- y0 = at.cast(at.as_tensor_variable(y0), floatX)
- theta = at.cast(at.as_tensor_variable(theta), floatX)
+ y0 = pt.cast(pt.as_tensor_variable(y0), floatX)
+ theta = pt.cast(pt.as_tensor_variable(theta), floatX)
inputs = [y0, theta]
for i, (input_val, itype) in enumerate(zip(inputs, self._itypes)):
if not itype.is_super(input_val.type):
@@ -237,8 +237,8 @@ def grad(self, inputs, output_grads):
# for each parameter, multiply sensitivities with the output gradient and sum the result
# sens is (n_times, n_states, n_p)
# ograds is (n_times, n_states)
- grads = [at.sum(sens[:, :, p] * ograds) for p in range(self.n_p)]
+ grads = [pt.sum(sens[:, :, p] * ograds) for p in range(self.n_p)]
# return separate gradient tensors for y0 and theta inputs
- result = at.stack(grads[: self.n_states]), at.stack(grads[self.n_states :])
+ result = pt.stack(grads[: self.n_states]), pt.stack(grads[self.n_states :])
return result
diff --git a/pymc/ode/utils.py b/pymc/ode/utils.py
index d2a93792ed..0eccd640e3 100644
--- a/pymc/ode/utils.py
+++ b/pymc/ode/utils.py
@@ -14,7 +14,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
def make_sens_ic(n_states, n_theta, floatX):
@@ -83,46 +83,46 @@ def augment_system(ode_func, n_states, n_theta):
"""
# Present state of the system
- t_y = at.vector("y", dtype="float64")
+ t_y = pt.vector("y", dtype="float64")
t_y.tag.test_value = np.ones((n_states,), dtype="float64")
# Parameter(s). Should be vector to allow for generaliztion to multiparameter
# systems of ODEs. Is m dimensional because it includes all initial conditions as well as ode parameters
- t_p = at.vector("p", dtype="float64")
+ t_p = pt.vector("p", dtype="float64")
t_p.tag.test_value = np.ones((n_states + n_theta,), dtype="float64")
# Time. Allow for non-autonomous systems of ODEs to be analyzed
- t_t = at.scalar("t", dtype="float64")
+ t_t = pt.scalar("t", dtype="float64")
t_t.tag.test_value = 2.459
# Present state of the gradients:
# Will always be 0 unless the parameter is the initial condition
# Entry i,j is partial of y[i] wrt to p[j]
- dydp_vec = at.vector("dydp", dtype="float64")
+ dydp_vec = pt.vector("dydp", dtype="float64")
dydp_vec.tag.test_value = make_sens_ic(n_states, n_theta, "float64")
dydp = dydp_vec.reshape((n_states, n_states + n_theta))
# Get symbolic representation of the ODEs by passing tensors for y, t and theta
yhat = ode_func(t_y, t_t, t_p[n_states:])
- if isinstance(yhat, at.TensorVariable):
- t_yhat = at.atleast_1d(yhat)
+ if isinstance(yhat, pt.TensorVariable):
+ t_yhat = pt.atleast_1d(yhat)
else:
# Stack the results of the ode_func into a single tensor variable
if not isinstance(yhat, (list, tuple)):
raise TypeError(
f"Unexpected type, {type(yhat)}, returned by ode_func. TensorVariable, list or tuple is expected."
)
- t_yhat = at.stack(yhat, axis=0)
+ t_yhat = pt.stack(yhat, axis=0)
if t_yhat.ndim > 1:
raise ValueError(
f"The odefunc returned a {t_yhat.ndim}-dimensional tensor, but 0 or 1 dimensions were expected."
)
# Now compute gradients
- J = at.jacobian(t_yhat, t_y)
+ J = pt.jacobian(t_yhat, t_y)
- Jdfdy = at.dot(J, dydp)
+ Jdfdy = pt.dot(J, dydp)
- grad_f = at.jacobian(t_yhat, t_p)
+ grad_f = pt.jacobian(t_yhat, t_p)
# This is the time derivative of dydp
ddt_dydp = (Jdfdy + grad_f).flatten()
diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
index bca1c7bdca..3c9cb945be 100644
--- a/pymc/pytensorf.py
+++ b/pymc/pytensorf.py
@@ -29,7 +29,7 @@
import numpy as np
import pandas as pd
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import scipy.sparse as sps
from pytensor import scalar
@@ -42,7 +42,6 @@
Variable,
clone_get_equiv,
graph_inputs,
- vars_between,
walk,
)
from pytensor.graph.fg import FunctionGraph
@@ -51,6 +50,7 @@
from pytensor.tensor.basic import _as_tensor_variable
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.random.op import RandomVariable
+from pytensor.tensor.random.type import RandomType
from pytensor.tensor.random.var import (
RandomGeneratorSharedVariable,
RandomStateSharedVariable,
@@ -147,7 +147,7 @@ def convert_observed_data(data):
@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
- return at.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+ return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
def extract_obs_data(x: TensorVariable) -> np.ndarray:
@@ -470,10 +470,10 @@ def smartfloatX(x):
def gradient1(f, v):
"""flat gradient of f wrt v"""
- return at.flatten(grad(f, v, disconnected_inputs="warn"))
+ return pt.flatten(grad(f, v, disconnected_inputs="warn"))
-empty_gradient = at.zeros(0, dtype="float32")
+empty_gradient = pt.zeros(0, dtype="float32")
def gradient(f, vars=None):
@@ -481,15 +481,15 @@ def gradient(f, vars=None):
vars = cont_inputs(f)
if vars:
- return at.concatenate([gradient1(f, v) for v in vars], axis=0)
+ return pt.concatenate([gradient1(f, v) for v in vars], axis=0)
else:
return empty_gradient
def jacobian1(f, v):
"""jacobian of f wrt v"""
- f = at.flatten(f)
- idx = at.arange(f.shape[0], dtype="int32")
+ f = pt.flatten(f)
+ idx = pt.arange(f.shape[0], dtype="int32")
def grad_i(i):
return gradient1(f[i], v)
@@ -502,13 +502,13 @@ def jacobian(f, vars=None):
vars = cont_inputs(f)
if vars:
- return at.concatenate([jacobian1(f, v) for v in vars], axis=1)
+ return pt.concatenate([jacobian1(f, v) for v in vars], axis=1)
else:
return empty_gradient
def jacobian_diag(f, x):
- idx = at.arange(f.shape[0], dtype="int32")
+ idx = pt.arange(f.shape[0], dtype="int32")
def grad_ii(i, f, x):
return grad(f[i], x)[i]
@@ -526,7 +526,7 @@ def hessian(f, vars=None):
@pytensor.config.change_flags(compute_test_value="ignore")
def hessian_diag1(f, v):
g = gradient1(f, v)
- idx = at.arange(g.shape[0], dtype="int32")
+ idx = pt.arange(g.shape[0], dtype="int32")
def hess_ii(i):
return gradient1(g[i], v)[i]
@@ -540,7 +540,7 @@ def hessian_diag(f, vars=None):
vars = cont_inputs(f)
if vars:
- return -at.concatenate([hessian_diag1(f, v) for v in vars], axis=0)
+ return -pt.concatenate([hessian_diag1(f, v) for v in vars], axis=0)
else:
return empty_gradient
@@ -643,14 +643,14 @@ def join_nonshared_inputs(
.. code-block:: python
- import pytensor.tensor as at
+ import pytensor.tensor as pt
import numpy as np
from pymc.pytensorf import join_nonshared_inputs
# Original non-shared inputs
- x = at.scalar("x")
- y = at.vector("y")
+ x = pt.scalar("x")
+ y = pt.vector("y")
# Original output
out = x + y
print(out.eval({x: np.array(1), y: np.array([1, 2, 3])})) # [2, 3, 4]
@@ -725,7 +725,7 @@ def join_nonshared_inputs(
if not inputs:
raise ValueError("Empty list of input variables.")
- raveled_inputs = at.concatenate([var.ravel() for var in inputs])
+ raveled_inputs = pt.concatenate([var.ravel() for var in inputs])
if not make_inputs_shared:
tensor_type = raveled_inputs.type
@@ -886,7 +886,7 @@ def ix_(*args):
for k, new in enumerate(args):
if new is None:
out.append(slice(None))
- new = at.as_tensor(new)
+ new = pt.as_tensor(new)
if new.ndim != 1:
raise ValueError("Cross index must be 1 dimensional")
new = new.reshape((1,) * k + (new.size,) + (1,) * (nd - k - 1))
@@ -913,19 +913,21 @@ def local_remove_check_parameter(fgraph, node):
@node_rewriter(tracks=[CheckParameterValue])
def local_check_parameter_to_ninf_switch(fgraph, node):
- if isinstance(node.op, CheckParameterValue):
- logp_expr, *logp_conds = node.inputs
- if len(logp_conds) > 1:
- logp_cond = at.all(logp_conds)
- else:
- (logp_cond,) = logp_conds
- out = at.switch(logp_cond, logp_expr, -np.inf)
- out.name = node.op.msg
+ if not node.op.can_be_replaced_by_ninf:
+ return None
+
+ logp_expr, *logp_conds = node.inputs
+ if len(logp_conds) > 1:
+ logp_cond = pt.all(logp_conds)
+ else:
+ (logp_cond,) = logp_conds
+ out = pt.switch(logp_cond, logp_expr, -np.inf)
+ out.name = node.op.msg
- if out.dtype != node.outputs[0].dtype:
- out = at.cast(out, node.outputs[0].dtype)
+ if out.dtype != node.outputs[0].dtype:
+ out = pt.cast(out, node.outputs[0].dtype)
- return [out]
+ return [out]
pytensor.compile.optdb["canonicalize"].register(
@@ -968,7 +970,7 @@ def replace_rng_nodes(outputs: Sequence[TensorVariable]) -> Sequence[TensorVaria
new_rng_nodes: List[Union[np.random.RandomState, np.random.Generator]] = []
for rng_node in rng_nodes:
rng_cls: type
- if isinstance(rng_node, at.random.var.RandomStateSharedVariable):
+ if isinstance(rng_node, pt.random.var.RandomStateSharedVariable):
rng_cls = np.random.RandomState
else:
rng_cls = np.random.Generator
@@ -990,7 +992,7 @@ def reseed_rngs(
]
for rng, bit_generator in zip(rngs, bit_generators):
new_rng: Union[np.random.RandomState, np.random.Generator]
- if isinstance(rng, at.random.var.RandomStateSharedVariable):
+ if isinstance(rng, pt.random.var.RandomStateSharedVariable):
new_rng = np.random.RandomState(bit_generator)
else:
new_rng = np.random.Generator(bit_generator)
@@ -998,42 +1000,85 @@ def reseed_rngs(
def collect_default_updates(
- inputs: Sequence[Variable], outputs: Sequence[Variable]
+ inputs: Sequence[Variable],
+ outputs: Sequence[Variable],
+ must_be_shared: bool = True,
) -> Dict[Variable, Variable]:
- """Collect default update expression of RVs between inputs and outputs"""
+ """Collect default update expression for shared-variable RNGs used by RVs between inputs and outputs.
+
+ If `must_be_shared` is False, update expressions will also be returned for non-shared input RNGs.
+ This can be useful to obtain the symbolic update expressions from inner graphs.
+ """
# Avoid circular import
from pymc.distributions.distribution import SymbolicRandomVariable
+ def find_default_update(clients, rng: Variable) -> Union[None, Variable]:
+ rng_clients = clients.get(rng, None)
+
+ # Root case, RNG is not used elsewhere
+ if not rng_clients:
+ return rng
+
+ if len(rng_clients) > 1:
+ warnings.warn(
+ f"RNG Variable {rng} has multiple clients. This is likely an inconsistent random graph.",
+ UserWarning,
+ )
+ return None
+
+ [client, _] = rng_clients[0]
+
+ # RNG is an output of the function, this is not a problem
+ if client == "output":
+ return rng
+
+ # RNG is used by another operator, which should output an update for the RNG
+ if isinstance(client.op, RandomVariable):
+ # RandomVariable first output is always the update of the input RNG
+ next_rng = client.outputs[0]
+
+ elif isinstance(client.op, SymbolicRandomVariable):
+ # SymbolicRandomVariables have an explicit method that returns an
+ # update mapping for their RNG(s)
+ next_rng = client.op.update(client).get(rng)
+ if next_rng is None:
+ raise ValueError(
+ f"No update mapping found for RNG used in SymbolicRandomVariable Op {client.op}"
+ )
+ else:
+ # We don't know how this RNG should be updated (e.g., Scan).
+ # The user should provide an update manually
+ return None
+
+ # Recurse until we find final update for RNG
+ return find_default_update(clients, next_rng)
+
+ outputs = makeiter(outputs)
+ fg = FunctionGraph(outputs=outputs, clone=False)
+ clients = fg.clients
+
rng_updates = {}
- output_to_list = outputs if isinstance(outputs, (list, tuple)) else [outputs]
- for random_var in (
- var
- for var in vars_between(inputs, output_to_list)
- if var.owner
- and isinstance(var.owner.op, (RandomVariable, SymbolicRandomVariable))
- and var not in inputs
+ # Iterate over input RNGs. Only consider shared RNGs if `must_be_shared==True`
+ for input_rng in (
+ inp
+ for inp in graph_inputs(outputs, blockers=inputs)
+ if (
+ (not must_be_shared or isinstance(inp, SharedVariable))
+ and isinstance(inp.type, RandomType)
+ )
):
- # All nodes in `vars_between(inputs, outputs)` have owners.
- # But mypy doesn't know, so we just assert it:
- assert random_var.owner.op is not None
- if isinstance(random_var.owner.op, RandomVariable):
- rng = random_var.owner.inputs[0]
- if getattr(rng, "default_update", None) is not None:
- update_map = {rng: rng.default_update}
- else:
- update_map = {rng: random_var.owner.outputs[0]}
+ # Even if an explicit default update is provided, we still call
+ # find_default_update to issue any warnings about invalid random graphs.
+ default_update = find_default_update(clients, input_rng)
+
+ # Respect default update if provided
+ if getattr(input_rng, "default_update", None):
+ rng_updates[input_rng] = input_rng.default_update
else:
- update_map = random_var.owner.op.update(random_var.owner)
- # Check that we are not setting different update expressions for the same variables
- for rng, update in update_map.items():
- if rng not in rng_updates:
- rng_updates[rng] = update
- # When a variable has multiple outputs, it will be called twice with the same
- # update expression. We don't want to raise in that case, only if the update
- # expression in different from the one already registered
- elif rng_updates[rng] is not update:
- raise ValueError(f"Multiple update expressions found for the variable {rng}")
+ if default_update is not None:
+ rng_updates[input_rng] = default_update
+
return rng_updates
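A minimal sketch of what the reworked helper returns, assuming that calling `pt.random.normal` without an explicit `rng` creates an implicit shared RNG input:

```python
import pytensor
import pytensor.tensor as pt
from pymc.pytensorf import collect_default_updates

x = pt.random.normal(size=3)  # implicit shared RNG input
updates = collect_default_updates(inputs=[], outputs=[x])

# The mapping sends the shared RNG to the next-state output of the
# RandomVariable node, so it can be used as `updates=` when compiling.
fn = pytensor.function([], x, updates=updates)
print(fn(), fn())  # different draws, because the RNG is advanced each call
```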
diff --git a/pymc/sampling/forward.py b/pymc/sampling/forward.py
index 7c4fa7e4cb..fd0260c3ae 100644
--- a/pymc/sampling/forward.py
+++ b/pymc/sampling/forward.py
@@ -36,7 +36,7 @@
from arviz import InferenceData
from fastprogress.fastprogress import progress_bar
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.graph.basic import (
Apply,
Constant,
@@ -253,7 +253,7 @@ def expand(node):
node,
value
if isinstance(value, (Variable, Apply))
- else at.constant(value, dtype=getattr(node, "dtype", None), name=node.name),
+ else pt.constant(value, dtype=getattr(node, "dtype", None), name=node.name),
)
for node, value in givens_dict.items()
]
@@ -481,8 +481,7 @@ def sample_posterior_predictive(
Whether to automatically use :meth:`arviz.InferenceData.extend` to add the posterior predictive samples to
``trace`` or not. If True, ``trace`` is modified inplace but still returned.
predictions : bool, default False
- Choose the function used to convert the samples to inferencedata. See ``idata_kwargs``
- for more details.
+ Flag used to set the location of posterior predictive samples within the returned ``arviz.InferenceData`` object. If False, the samples are assumed to be generated from the fitting data, to be used for posterior predictive checks, and are stored in the ``posterior_predictive`` group. If True, the samples are assumed to be predictions for out-of-sample data and are stored in the ``predictions`` group.
idata_kwargs : dict, optional
Keyword arguments for :func:`pymc.to_inference_data` if ``predictions=False`` or to
:func:`pymc.predictions_to_inference_data` otherwise.
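A hedged sketch of the distinction described above, using a hypothetical toy model; the group names follow the docstring:

```python
import numpy as np
import pymc as pm

with pm.Model() as model:
    mu = pm.Normal("mu")
    pm.Normal("y", mu=mu, sigma=1.0, observed=np.array([0.1, -0.2, 0.3]))
    idata = pm.sample(draws=50, tune=50, chains=1, progressbar=False)

    # predictions=False (default) -> stored in idata.posterior_predictive
    # predictions=True            -> stored in idata.predictions
    pm.sample_posterior_predictive(idata, predictions=True, extend_inferencedata=True)

print("predictions" in idata.groups())
```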
diff --git a/pymc/sampling/jax.py b/pymc/sampling/jax.py
index 0cdc8afd0c..43741f8e87 100644
--- a/pymc/sampling/jax.py
+++ b/pymc/sampling/jax.py
@@ -22,7 +22,7 @@
import arviz as az
import jax
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from arviz.data.base import make_attrs
from jax.experimental.maps import SerialLoop, xmap
@@ -96,7 +96,7 @@ def _replace_shared_variables(graph: List[TensorVariable]) -> List[TensorVariabl
"be safely replaced."
)
- replacements = {var: at.constant(var.get_value(borrow=True)) for var in shared_variables}
+ replacements = {var: pt.constant(var.get_value(borrow=True)) for var in shared_variables}
new_graph = clone_replace(graph, replace=replacements)
return new_graph
@@ -317,6 +317,7 @@ def sample_blackjax_nuts(
postprocessing_backend: Optional[str] = None,
postprocessing_chunks: Optional[int] = None,
idata_kwargs: Optional[Dict[str, Any]] = None,
+ **kwargs,
) -> az.InferenceData:
"""
Draw samples from the posterior using the NUTS method from the ``blackjax`` library.
@@ -529,6 +530,7 @@ def sample_numpyro_nuts(
postprocessing_chunks: Optional[int] = None,
idata_kwargs: Optional[Dict] = None,
nuts_kwargs: Optional[Dict] = None,
+ **kwargs,
) -> az.InferenceData:
"""
Draw samples from the posterior using the NUTS method from the ``numpyro`` library.
diff --git a/pymc/sampling/mcmc.py b/pymc/sampling/mcmc.py
index 0449b029f8..4ec7b254b8 100644
--- a/pymc/sampling/mcmc.py
+++ b/pymc/sampling/mcmc.py
@@ -237,10 +237,14 @@ def _sample_external_nuts(
model: Model,
progressbar: bool,
idata_kwargs: Optional[Dict],
+ nuts_sampler_kwargs: Optional[Dict],
**kwargs,
):
warnings.warn("Use of external NUTS sampler is still experimental", UserWarning)
+ if nuts_sampler_kwargs is None:
+ nuts_sampler_kwargs = {}
+
if sampler == "nutpie":
try:
import nutpie
@@ -271,7 +275,7 @@ def _sample_external_nuts(
target_accept=target_accept,
seed=_get_seeds_per_chain(random_seed, 1)[0],
progress_bar=progressbar,
- **kwargs,
+ **nuts_sampler_kwargs,
)
return idata
@@ -288,7 +292,7 @@ def _sample_external_nuts(
model=model,
progressbar=progressbar,
idata_kwargs=idata_kwargs,
- **kwargs,
+ **nuts_sampler_kwargs,
)
return idata
@@ -304,7 +308,7 @@ def _sample_external_nuts(
initvals=initvals,
model=model,
idata_kwargs=idata_kwargs,
- **kwargs,
+ **nuts_sampler_kwargs,
)
return idata
@@ -334,6 +338,7 @@ def sample(
keep_warning_stat: bool = False,
return_inferencedata: bool = True,
idata_kwargs: Optional[Dict[str, Any]] = None,
+ nuts_sampler_kwargs: Optional[Dict[str, Any]] = None,
callback=None,
mp_ctx=None,
model: Optional[Model] = None,
@@ -410,6 +415,9 @@ def sample(
`MultiTrace` (False). Defaults to `True`.
idata_kwargs : dict, optional
Keyword arguments for :func:`pymc.to_inference_data`
+ nuts_sampler_kwargs : dict, optional
+ Keyword arguments for the sampling library that implements NUTS.
+ Only used when an external sampler is specified via the `nuts_sampler` kwarg.
callback : function, default=None
A function which gets called for every sample from the trace of a chain. The function is
called with the trace and the current draw and will contain all samples for a single trace.
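A hypothetical call showing where ``nuts_sampler_kwargs`` ends up: it is forwarded verbatim to the external sampler library. The example assumes an existing ``model`` context and that ``numpyro`` is installed; ``chain_method`` is an option of that library's sampler, used here purely as an illustration.

```python
with model:
    idata = pm.sample(
        draws=1000,
        nuts_sampler="numpyro",
        nuts_sampler_kwargs={"chain_method": "vectorized"},
    )
```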
@@ -493,6 +501,8 @@ def sample(
stacklevel=2,
)
initvals = kwargs.pop("start")
+ if nuts_sampler_kwargs is None:
+ nuts_sampler_kwargs = {}
if "target_accept" in kwargs:
if "nuts" in kwargs and "target_accept" in kwargs["nuts"]:
raise ValueError(
@@ -569,6 +579,7 @@ def sample(
model=model,
progressbar=progressbar,
idata_kwargs=idata_kwargs,
+ nuts_sampler_kwargs=nuts_sampler_kwargs,
**kwargs,
)
diff --git a/pymc/smc/kernels.py b/pymc/smc/kernels.py
index d56133ab1e..70db7420f3 100644
--- a/pymc/smc/kernels.py
+++ b/pymc/smc/kernels.py
@@ -19,7 +19,7 @@
from typing import Dict, Union, cast
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.graph.replace import clone_replace
from scipy.special import logsumexp
@@ -617,9 +617,9 @@ def _logp_forw(point, out_vars, in_vars, shared):
new_in_vars = []
for in_var in in_vars:
if in_var.dtype in discrete_types:
- float_var = at.TensorType("floatX", in_var.type.shape)(in_var.name)
+ float_var = pt.TensorType("floatX", in_var.type.shape)(in_var.name)
new_in_vars.append(float_var)
- replace_int_input[in_var] = at.round(float_var).astype(in_var.dtype)
+ replace_int_input[in_var] = pt.round(float_var).astype(in_var.dtype)
else:
new_in_vars.append(in_var)
diff --git a/pymc/stats/convergence.py b/pymc/stats/convergence.py
index 848e681f8c..dcea7f56d7 100644
--- a/pymc/stats/convergence.py
+++ b/pymc/stats/convergence.py
@@ -154,17 +154,17 @@ def warn_treedepth(idata: arviz.InferenceData) -> List[SamplerWarning]:
if sampler_stats is None:
return []
- treedepth = sampler_stats.get("tree_depth", None)
- if treedepth is None:
+ rmtd = sampler_stats.get("reached_max_treedepth", None)
+ if rmtd is None:
return []
warnings = []
- for c in treedepth.chain:
- if sum(treedepth.sel(chain=c)) / treedepth.sizes["draw"] > 0.05:
+ for c in rmtd.chain:
+ if sum(rmtd.sel(chain=c)) / rmtd.sizes["draw"] > 0.05:
warnings.append(
SamplerWarning(
WarningType.TREEDEPTH,
- f"Chain {c} reached the maximum tree depth."
+ f"Chain {int(c)} reached the maximum tree depth."
" Increase `max_treedepth`, increase `target_accept` or reparameterize.",
"warn",
)
diff --git a/pymc/step_methods/metropolis.py b/pymc/step_methods/metropolis.py
index 0ee10a24d6..f3ba5f152f 100644
--- a/pymc/step_methods/metropolis.py
+++ b/pymc/step_methods/metropolis.py
@@ -19,7 +19,7 @@
import scipy.linalg
import scipy.special
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.graph.fg import MissingInputError
from pytensor.tensor.random.basic import BernoulliRV, CategoricalRV
@@ -1018,9 +1018,9 @@ def sample_except(limit, excluded):
def delta_logp(
point: Dict[str, np.ndarray],
- logp: at.TensorVariable,
- vars: List[at.TensorVariable],
- shared: Dict[at.TensorVariable, at.sharedvar.TensorSharedVariable],
+ logp: pt.TensorVariable,
+ vars: List[pt.TensorVariable],
+ shared: Dict[pt.TensorVariable, pt.sharedvar.TensorSharedVariable],
) -> pytensor.compile.Function:
[logp0], inarray0 = join_nonshared_inputs(
point=point, outputs=[logp], inputs=vars, shared_inputs=shared
diff --git a/tests/distributions/util.py b/pymc/testing.py
similarity index 64%
rename from tests/distributions/util.py
rename to pymc/testing.py
index 9563d1e4f5..86f2910a2b 100644
--- a/tests/distributions/util.py
+++ b/pymc/testing.py
@@ -14,29 +14,53 @@
import functools as ft
import itertools as it
-from contextlib import ExitStack as does_not_raise
-from typing import Callable, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
-import numpy.random as nr
-import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
-import scipy.special as sp
-import scipy.stats as st
+from numpy import random as nr
+from numpy import testing as npt
from pytensor.compile.mode import Mode
+from pytensor.graph.basic import walk
+from pytensor.graph.op import HasInnerGraph
+from pytensor.graph.rewriting.basic import in2out
+from pytensor.tensor import TensorVariable
+from pytensor.tensor.random.op import RandomVariable
+from scipy import special as sp
+from scipy import stats as st
import pymc as pm
+from pymc.distributions.distribution import Distribution
from pymc.distributions.shape_utils import change_dist_size
from pymc.initial_point import make_initial_point_fn
-from pymc.logprob.abstract import logcdf
-from pymc.logprob.joint_logprob import joint_logp, logp
+from pymc.logprob.abstract import MeasurableVariable
+from pymc.logprob.basic import icdf, joint_logp, logcdf, logp
from pymc.logprob.utils import ParameterValueError
-from pymc.pytensorf import compile_pymc, floatX, intX
-from tests.helpers import SeededTest, select_by_precision
+from pymc.pytensorf import (
+ compile_pymc,
+ floatX,
+ inputvars,
+ intX,
+ local_check_parameter_to_ninf_switch,
+)
+
+# This mode can be used for tests where model compilation takes the bulk of the runtime
+# AND where we don't care about posterior numerical or sampling stability (e.g., when
+# all that matters is the shape of the draws or deterministic values of observed data).
+# DO NOT USE UNLESS YOU HAVE A GOOD REASON TO!
+fast_unstable_sampling_mode = (
+ pytensor.compile.mode.FAST_COMPILE
+ # Remove slow rewrite phases
+ .excluding("canonicalize", "specialize")
+ # Include necessary rewrites for proper logp handling
+ .including("remove_TransformedVariables").register(
+ (in2out(local_check_parameter_to_ninf_switch), -1)
+ )
+)
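A sketch of one way this mode could be exercised (an assumption, not an established test pattern): compiling a model's logp directly with `pytensor.function`, which accepts a `Mode` instance. The registered rewrite turns parameter checks into switches that return `-inf` instead of raising.

```python
import pytensor
import pymc as pm
from pymc.testing import fast_unstable_sampling_mode

with pm.Model() as m:
    x = pm.Normal("x")

logp_fn = pytensor.function(m.value_vars, m.logp(), mode=fast_unstable_sampling_mode)
print(logp_fn(0.0))  # ~ -0.9189, the standard normal logpdf at 0
```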
def product(domains, n_samples=-1):
@@ -47,7 +71,8 @@ def product(domains, n_samples=-1):
must be "domain-like", as in, have a `.vals` property
n_samples: int, maximum samples to return. -1 to return whole product
- Returns:
+ Returns
+ -------
list of the cartesian product of the domains
"""
try:
@@ -119,22 +144,6 @@ def __neg__(self):
return Domain([-v for v in self.vals], self.dtype, (-self.lower, -self.upper), self.shape)
-@pytest.mark.parametrize(
- "values, edges, expectation",
- [
- ([], None, pytest.raises(IndexError)),
- ([], (0, 0), pytest.raises(ValueError)),
- ([0], None, pytest.raises(ValueError)),
- ([0], (0, 0), does_not_raise()),
- ([-1, 1], None, pytest.raises(ValueError)),
- ([-1, 0, 1], None, does_not_raise()),
- ],
-)
-def test_domain(values, edges, expectation):
- with expectation:
- Domain(values, edges=edges)
-
-
class ProductDomain:
def __init__(self, domains):
self.vals = list(it.product(*(d.vals for d in domains)))
@@ -203,24 +212,25 @@ def RandomPdMatrix(n):
Rplusbig = Domain([0, 0.5, 0.9, 0.99, 1, 1.5, 2, 20, np.inf])
Rminusbig = Domain([-np.inf, -2, -1.5, -1, -0.99, -0.9, -0.5, -0.01, 0])
Unit = Domain([0, 0.001, 0.1, 0.5, 0.75, 0.99, 1])
-
Circ = Domain([-np.pi, -2.1, -1, -0.01, 0.0, 0.01, 1, 2.1, np.pi])
-
Runif = Domain([-np.inf, -0.4, 0, 0.4, np.inf])
Rdunif = Domain([-np.inf, -1, 0, 1, np.inf], "int64")
Rplusunif = Domain([0, 0.5, np.inf])
Rplusdunif = Domain([0, 10, np.inf], "int64")
-
I = Domain([-np.inf, -3, -2, -1, 0, 1, 2, 3, np.inf], "int64")
-
NatSmall = Domain([0, 3, 4, 5, np.inf], "int64")
Nat = Domain([0, 1, 2, 3, np.inf], "int64")
NatBig = Domain([0, 1, 2, 3, 5000, np.inf], "int64")
PosNat = Domain([1, 2, 3, np.inf], "int64")
-
Bool = Domain([0, 0, 1, 1], "int64")
+def select_by_precision(float64, float32):
+ """Helper function to choose reasonable decimal cutoffs for different floatX modes."""
+ decimal = float64 if pytensor.config.floatX == "float64" else float32
+ return decimal
+
+
def build_model(distfam, valuedomain, vardomains, extra_args=None):
if extra_args is None:
extra_args = {}
@@ -228,9 +238,9 @@ def build_model(distfam, valuedomain, vardomains, extra_args=None):
with pm.Model() as m:
param_vars = {}
for v, dom in vardomains.items():
- v_at = pytensor.shared(np.asarray(dom.vals[0]))
- v_at.name = v
- param_vars[v] = v_at
+ v_pt = pytensor.shared(np.asarray(dom.vals[0]))
+ v_pt.name = v
+ param_vars[v] = v_pt
param_vars.update(extra_args)
distfam(
"value",
@@ -240,17 +250,69 @@ def build_model(distfam, valuedomain, vardomains, extra_args=None):
return m, param_vars
+def create_dist_from_paramdomains(
+ pymc_dist: Distribution,
+ paramdomains: Dict[str, Domain],
+ extra_args: Optional[Dict[str, Any]] = None,
+) -> TensorVariable:
+ """Create a PyMC distribution from a dictionary of parameter domains.
+
+ Returns
+ -------
+ TensorVariable
+ The PyMC distribution variable, whose parameter inputs are fresh value-type variables named after each parameter.
+ """
+ if extra_args is None:
+ extra_args = {}
+
+ param_vars = {}
+ for param, domain in paramdomains.items():
+ param_type = pt.constant(np.asarray(domain.vals[0])).type()
+ param_type.name = param
+ param_vars[param] = param_type
+
+ return pymc_dist.dist(**param_vars, **extra_args)
+
+
+def find_invalid_scalar_params(
+ paramdomains: Dict["str", Domain]
+) -> Dict["str", Tuple[Union[None, float], Union[None, float]]]:
+ """Find invalid parameter values from bounded scalar parameter domains.
+
+ For use in `check_logp`-like testing helpers.
+
+ Returns
+ -------
+ Invalid parameter values:
+ Dictionary mapping each parameter to a lower and an upper invalid value (out of domain).
+ If no lower or upper invalid values exist, None is returned for that entry.
+ """
+ invalid_params = {}
+ for param, paramdomain in paramdomains.items():
+ lower_edge, upper_edge = None, None
+
+ if np.ndim(paramdomain.lower) == 0:
+ if np.isfinite(paramdomain.lower):
+ lower_edge = paramdomain.lower - 1
+
+ if np.isfinite(paramdomain.upper):
+ upper_edge = paramdomain.upper + 1
+
+ invalid_params[param] = (lower_edge, upper_edge)
+ return invalid_params
+
+
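A small illustration, assuming the `Rplus` domain defined earlier in this module has edges `(0, inf)`: only the finite lower edge yields an invalid value.

```python
from pymc.testing import Rplus, find_invalid_scalar_params

invalid = find_invalid_scalar_params({"sigma": Rplus})
print(invalid)  # expected: {'sigma': (-1.0, None)} -- one below the lower edge, no finite upper edge
```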
def check_logp(
- pymc_dist,
- domain,
- paramdomains,
- scipy_logp,
- decimal=None,
- n_samples=100,
- extra_args=None,
- scipy_args=None,
- skip_paramdomain_outside_edge_test=False,
-):
+ pymc_dist: Distribution,
+ domain: Domain,
+ paramdomains: Dict[str, Domain],
+ scipy_logp: Callable,
+ decimal: Optional[int] = None,
+ n_samples: int = 100,
+ extra_args: Optional[Dict[str, Any]] = None,
+ scipy_args: Optional[Dict[str, Any]] = None,
+ skip_paramdomain_outside_edge_test: bool = False,
+) -> None:
"""
Generic test for PyMC logp methods
@@ -285,122 +347,77 @@ def check_logp(
if decimal is None:
decimal = select_by_precision(float64=6, float32=3)
- if extra_args is None:
- extra_args = {}
-
if scipy_args is None:
scipy_args = {}
- def logp_reference(args):
+ def scipy_logp_with_scipy_args(**args):
args.update(scipy_args)
return scipy_logp(**args)
- def _model_input_dict(model, param_vars, pt):
- """Create a dict with only the necessary, transformed logp inputs."""
- pt_d = {}
- for k, v in pt.items():
- rv_var = model.named_vars.get(k)
- nv = param_vars.get(k, rv_var)
- nv = model.rvs_to_values.get(nv, nv)
-
- transform = model.rvs_to_transforms.get(rv_var, None)
- if transform:
- # todo: the compiled graph behind this should be cached and
- # reused (if it isn't already).
- v = transform.forward(rv_var, v).eval()
+ dist = create_dist_from_paramdomains(pymc_dist, paramdomains, extra_args)
+ value = dist.type()
+ value.name = "value"
+ pymc_dist_logp = logp(dist, value).sum()
+ pymc_logp = pytensor.function(list(inputvars(pymc_dist_logp)), pymc_dist_logp)
- if nv.name in param_vars:
- # update the shared parameter variables in `param_vars`
- param_vars[nv.name].set_value(v)
- else:
- # create an argument entry for the (potentially
- # transformed) "value" variable
- pt_d[nv.name] = v
-
- return pt_d
-
- model, param_vars = build_model(pymc_dist, domain, paramdomains, extra_args)
- logp_pymc = model.compile_logp(jacobian=False)
-
- # Test supported value and parameters domain matches scipy
+ # Test supported value and parameters domain matches Scipy
domains = paramdomains.copy()
domains["value"] = domain
- for pt in product(domains, n_samples=n_samples):
- pt = dict(pt)
- pt_d = _model_input_dict(model, param_vars, pt)
- pt_logp = pm.Point(pt_d, model=model)
- pt_ref = pm.Point(pt, filter_model_vars=False, model=model)
+ for point in product(domains, n_samples=n_samples):
+ point = dict(point)
npt.assert_almost_equal(
- logp_pymc(pt_logp),
- logp_reference(pt_ref),
+ pymc_logp(**point),
+ scipy_logp_with_scipy_args(**point),
decimal=decimal,
- err_msg=str(pt),
+ err_msg=str(point),
)
valid_value = domain.vals[0]
valid_params = {param: paramdomain.vals[0] for param, paramdomain in paramdomains.items()}
- valid_dist = pymc_dist.dist(**valid_params, **extra_args)
+ valid_params["value"] = valid_value
# Test pymc distribution raises ParameterValueError for scalar parameters outside
# the supported domain edges (excluding edges)
if not skip_paramdomain_outside_edge_test:
- # Step1: collect potential invalid parameters
- invalid_params = {param: [None, None] for param in paramdomains}
- for param, paramdomain in paramdomains.items():
- if np.ndim(paramdomain.lower) != 0:
- continue
- if np.isfinite(paramdomain.lower):
- invalid_params[param][0] = paramdomain.lower - 1
- if np.isfinite(paramdomain.upper):
- invalid_params[param][1] = paramdomain.upper + 1
+ invalid_params = find_invalid_scalar_params(paramdomains)
- # Step2: test invalid parameters, one a time
for invalid_param, invalid_edges in invalid_params.items():
for invalid_edge in invalid_edges:
if invalid_edge is None:
continue
- test_params = valid_params.copy() # Shallow copy should be okay
- test_params[invalid_param] = at.as_tensor_variable(invalid_edge)
- # We need to remove `Assert`s introduced by checks like
- # `assert_negative_support` and disable test values;
- # otherwise, we won't be able to create the `RandomVariable`
- with pytensor.config.change_flags(compute_test_value="off"):
- invalid_dist = pymc_dist.dist(**test_params, **extra_args)
- with pytensor.config.change_flags(mode=Mode("py")):
- with pytest.raises(ParameterValueError):
- logp(invalid_dist, valid_value).eval()
- pytest.fail(f"test_params={test_params}, valid_value={valid_value}")
+
+ point = valid_params.copy() # Shallow copy should be okay
+ point[invalid_param] = invalid_edge
+ with pytest.raises(ParameterValueError):
+ pymc_logp(**point)
+ pytest.fail(f"test_params={point}")
# Test that values outside of scalar domain support evaluate to -np.inf
- if np.ndim(domain.lower) != 0:
- return
- invalid_values = [None, None]
- if np.isfinite(domain.lower):
- invalid_values[0] = domain.lower - 1
- if np.isfinite(domain.upper):
- invalid_values[1] = domain.upper + 1
+ invalid_values = find_invalid_scalar_params({"value": domain})["value"]
for invalid_value in invalid_values:
if invalid_value is None:
continue
- with pytensor.config.change_flags(mode=Mode("py")):
- npt.assert_equal(
- logp(valid_dist, invalid_value).eval(),
- -np.inf,
- err_msg=str(invalid_value),
- )
+
+ point = valid_params.copy()
+ point["value"] = invalid_value
+ npt.assert_equal(
+ pymc_logp(**point),
+ -np.inf,
+ err_msg=str(point),
+ )
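A usage sketch for the rewritten helper: the `R` and `Rplus` domains are defined earlier in this module, and the lambda adapts scipy's positional `loc`/`scale` arguments to the keyword names used in the domain dictionary.

```python
import scipy.stats as st
import pymc as pm
from pymc.testing import R, Rplus, check_logp

check_logp(
    pm.Normal,
    R,
    {"mu": R, "sigma": Rplus},
    lambda value, mu, sigma: st.norm.logpdf(value, mu, sigma),
)
```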
def check_logcdf(
- pymc_dist,
- domain,
- paramdomains,
- scipy_logcdf,
- decimal=None,
- n_samples=100,
- skip_paramdomain_inside_edge_test=False,
- skip_paramdomain_outside_edge_test=False,
-):
+ pymc_dist: Distribution,
+ domain: Domain,
+ paramdomains: Dict[str, Domain],
+ scipy_logcdf: Callable,
+ decimal: Optional[int] = None,
+ n_samples: int = 100,
+ skip_paramdomain_inside_edge_test: bool = False,
+ skip_paramdomain_outside_edge_test: bool = False,
+) -> None:
"""
Generic test for PyMC logcdf methods
@@ -441,139 +458,202 @@ def check_logcdf(
Whether to run test 2., which checks that pymc distribution logcdf
returns -inf for invalid parameter values outside the supported domain edge
- Returns
- -------
-
"""
+ if decimal is None:
+ decimal = select_by_precision(float64=6, float32=3)
+
+ dist = create_dist_from_paramdomains(pymc_dist, paramdomains)
+ value = dist.type()
+ value.name = "value"
+ dist_logcdf = logcdf(dist, value)
+ pymc_logcdf = pytensor.function(list(inputvars(dist_logcdf)), dist_logcdf)
+
# Test pymc and scipy distributions match for values and parameters
# within the supported domain edges (excluding edges)
if not skip_paramdomain_inside_edge_test:
domains = paramdomains.copy()
domains["value"] = domain
-
- model, param_vars = build_model(pymc_dist, domain, paramdomains)
- rv = model["value"]
- value = model.rvs_to_values[rv]
- pymc_logcdf = model.compile_fn(logcdf(rv, value))
-
- if decimal is None:
- decimal = select_by_precision(float64=6, float32=3)
-
- for pt in product(domains, n_samples=n_samples):
- params = dict(pt)
- scipy_eval = scipy_logcdf(**params)
-
- value = params.pop("value")
- # Update shared parameter variables in pymc_logcdf function
- for param_name, param_value in params.items():
- param_vars[param_name].set_value(param_value)
- pymc_eval = pymc_logcdf({"value": value})
-
- params["value"] = value # for displaying in err_msg
+ for point in product(domains, n_samples=n_samples):
+ point = dict(point)
npt.assert_almost_equal(
- pymc_eval,
- scipy_eval,
+ pymc_logcdf(**point),
+ scipy_logcdf(**point),
decimal=decimal,
- err_msg=str(params),
+ err_msg=str(point),
)
valid_value = domain.vals[0]
valid_params = {param: paramdomain.vals[0] for param, paramdomain in paramdomains.items()}
- valid_dist = pymc_dist.dist(**valid_params)
+ valid_params["value"] = valid_value
# Test pymc distribution raises ParameterValueError for parameters outside the
# supported domain edges (excluding edges)
if not skip_paramdomain_outside_edge_test:
- # Step1: collect potential invalid parameters
- invalid_params = {param: [None, None] for param in paramdomains}
- for param, paramdomain in paramdomains.items():
- if np.isfinite(paramdomain.lower):
- invalid_params[param][0] = paramdomain.lower - 1
- if np.isfinite(paramdomain.upper):
- invalid_params[param][1] = paramdomain.upper + 1
- # Step2: test invalid parameters, one a time
+ invalid_params = find_invalid_scalar_params(paramdomains)
+
for invalid_param, invalid_edges in invalid_params.items():
for invalid_edge in invalid_edges:
- if invalid_edge is not None:
- test_params = valid_params.copy() # Shallow copy should be okay
- test_params[invalid_param] = at.as_tensor_variable(invalid_edge)
- # We need to remove `Assert`s introduced by checks like
- # `assert_negative_support` and disable test values;
- # otherwise, we won't be able to create the
- # `RandomVariable`
- with pytensor.config.change_flags(compute_test_value="off"):
- invalid_dist = pymc_dist.dist(**test_params)
- with pytensor.config.change_flags(mode=Mode("py")):
- with pytest.raises(ParameterValueError):
- logcdf(invalid_dist, valid_value).eval()
-
- # Test that values below domain edge evaluate to -np.inf
- if np.isfinite(domain.lower):
- below_domain = domain.lower - 1
- with pytensor.config.change_flags(mode=Mode("py")):
- npt.assert_equal(
- logcdf(valid_dist, below_domain).eval(),
- -np.inf,
- err_msg=str(below_domain),
- )
+ if invalid_edge is None:
+ continue
- # Test that values above domain edge evaluate to 0
- if np.isfinite(domain.upper):
- above_domain = domain.upper + 1
- with pytensor.config.change_flags(mode=Mode("py")):
- npt.assert_equal(
- logcdf(valid_dist, above_domain).eval(),
- 0,
- err_msg=str(above_domain),
- )
+ point = valid_params.copy()
+ point[invalid_param] = invalid_edge
+ with pytest.raises(ParameterValueError):
+ pymc_logcdf(**point)
+ pytest.fail(f"test_params={point}")
+
+ # Test that values below domain edge evaluate to -np.inf, and above evaluates to 0
+ invalid_lower, invalid_upper = find_invalid_scalar_params({"value": domain})["value"]
+ if invalid_lower is not None:
+ point = valid_params.copy()
+ point["value"] = invalid_lower
+ npt.assert_equal(
+ pymc_logcdf(**point),
+ -np.inf,
+ err_msg=str(point),
+ )
+ if invalid_upper is not None:
+ point = valid_params.copy()
+ point["value"] = invalid_upper
+ npt.assert_equal(
+ pymc_logcdf(**point),
+ 0,
+ err_msg=str(point),
+ )
- # Test that method works with multiple values or raises informative TypeError
- valid_dist = pymc_dist.dist(**valid_params, size=2)
- with pytensor.config.change_flags(mode=Mode("py")):
- try:
- logcdf(valid_dist, np.array([valid_value, valid_value])).eval()
- except TypeError as err:
- assert str(err).endswith(
- "logcdf expects a scalar value but received a 1-dimensional object."
+
+def check_icdf(
+ pymc_dist: Distribution,
+ paramdomains: Dict[str, Domain],
+ scipy_icdf: Callable,
+ skip_paramdomain_outside_edge_test=False,
+ decimal: Optional[int] = None,
+ n_samples: int = 100,
+) -> None:
+ """
+ Generic test for PyMC icdf methods
+
+ The following tests are performed by default:
+ 1. Test PyMC icdf and equivalent scipy icdf (ppf) methods give similar
+ results for parameters inside the supported edges.
+ Edges are excluded by default, but can be artificially included by
+ creating a domain with repeated values (e.g., `Domain([0, 0, .5, 1, 1])`)
+ 2. Test PyMC icdf method raises for invalid parameter values
+ outside the supported edges.
+ 3. Test PyMC icdf method returns np.nan for values below 0 or above 1,
+ when using valid parameters.
+
+ Parameters
+ ----------
+ pymc_dist: PyMC distribution
+ paramdomains : Dictionary of Parameter : Domain pairs
+ Supported domains of distribution parameters
+ scipy_icdf : Scipy icdf method
+ Scipy icdf (ppf) method of equivalent pymc_dist distribution
+ decimal : int, optional
+ Level of precision with which pymc_dist and scipy_icdf are compared.
+ Defaults to 6 for float64 and 3 for float32
+ n_samples : int
+ Upper limit on the number of valid domain and value combinations that
+ are compared between pymc and scipy methods. If n_samples is below the
+ total number of combinations, a random subset is evaluated. Setting
+ n_samples = -1, will return all possible combinations. Defaults to 100
+ skip_paramdomain_outside_edge_test : Bool
+ Whether to run test 2., which checks that pymc distribution icdf
+ returns nan for invalid parameter values outside the supported domain edge
+
+ """
+ if decimal is None:
+ decimal = select_by_precision(float64=6, float32=3)
+
+ dist = create_dist_from_paramdomains(pymc_dist, paramdomains)
+ q = pt.scalar(dtype="float64", name="q")
+ dist_icdf = icdf(dist, q)
+ pymc_icdf = pytensor.function(list(inputvars(dist_icdf)), dist_icdf)
+
+ # Test pymc and scipy distributions match for values and parameters
+ # within the supported domain edges (excluding edges)
+ domains = paramdomains.copy()
+ domain = Domain([0, 0.1, 0.5, 0.75, 0.95, 0.99, 1]) # Values we test the icdf at
+ domains["q"] = domain
+
+ for point in product(domains, n_samples=n_samples):
+ point = dict(point)
+ npt.assert_almost_equal(
+ pymc_icdf(**point),
+ scipy_icdf(**point),
+ decimal=decimal,
+ err_msg=str(point),
+ )
+
+ valid_value = domain.vals[0]
+ valid_params = {param: paramdomain.vals[0] for param, paramdomain in paramdomains.items()}
+ valid_params["q"] = valid_value
+
+ if not skip_paramdomain_outside_edge_test:
+ # Test pymc distribution raises ParameterValueError for parameters outside the
+ # supported domain edges (excluding edges)
+ invalid_params = find_invalid_scalar_params(paramdomains)
+ for invalid_param, invalid_edges in invalid_params.items():
+ for invalid_edge in invalid_edges:
+ if invalid_edge is None:
+ continue
+
+ point = valid_params.copy()
+ point[invalid_param] = invalid_edge
+ with pytest.raises(ParameterValueError):
+ pymc_icdf(**point)
+ pytest.fail(f"test_params={point}")
+
+ # Test that values below 0 or above 1 evaluate to nan
+ invalid_values = find_invalid_scalar_params({"q": domain})["q"]
+ for invalid_value in invalid_values:
+ if invalid_value is not None:
+ point = valid_params.copy()
+ point["q"] = invalid_value
+ npt.assert_equal(
+ pymc_icdf(**point),
+ np.nan,
+ err_msg=str(point),
)
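Mirroring the `check_logp` sketch above, a hedged usage example for the new `check_icdf` helper, with scipy's `ppf` as the inverse-CDF reference:

```python
import scipy.stats as st
import pymc as pm
from pymc.testing import R, Rplus, check_icdf

check_icdf(
    pm.Normal,
    {"mu": R, "sigma": Rplus},
    lambda q, mu, sigma: st.norm.ppf(q, mu, sigma),
)
```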
def check_selfconsistency_discrete_logcdf(
- distribution,
- domain,
- paramdomains,
- decimal=None,
- n_samples=100,
-):
+ distribution: Distribution,
+ domain: Domain,
+ paramdomains: Dict[str, Domain],
+ decimal: Optional[int] = None,
+ n_samples: int = 100,
+) -> None:
"""
- Check that logcdf of discrete distributions matches sum of logps up to value
+ Check that logcdf of discrete distributions matches sum of logps up to value.
"""
- domains = paramdomains.copy()
- domains["value"] = domain
if decimal is None:
decimal = select_by_precision(float64=6, float32=3)
- model, param_vars = build_model(distribution, domain, paramdomains)
- rv = model["value"]
- value = model.rvs_to_values[rv]
- dist_logcdf = model.compile_fn(logcdf(rv, value))
- dist_logp = model.compile_fn(logp(rv, value))
+ dist = create_dist_from_paramdomains(distribution, paramdomains)
+ value = dist.type()
+ value.name = "value"
+ dist_logp = logp(dist, value)
+ dist_logp_fn = pytensor.function(list(inputvars(dist_logp)), dist_logp)
- for pt in product(domains, n_samples=n_samples):
- params = dict(pt)
- value = params.pop("value")
- values = np.arange(domain.lower, value + 1)
+ dist_logcdf = logcdf(dist, value)
+ dist_logcdf_fn = compile_pymc(list(inputvars(dist_logcdf)), dist_logcdf)
+
+ domains = paramdomains.copy()
+ domains["value"] = domain
- # Update shared parameter variables in logp/logcdf function
- for param_name, param_value in params.items():
- param_vars[param_name].set_value(param_value)
+ for point in product(domains, n_samples=n_samples):
+ point = dict(point)
+ value = point.pop("value")
+ values = np.arange(domain.lower, value + 1)
with pytensor.config.change_flags(mode=Mode("py")):
npt.assert_almost_equal(
- dist_logcdf({"value": value}),
- sp.logsumexp([dist_logp({"value": value}) for value in values]),
+ dist_logcdf_fn(**point, value=value),
+ sp.logsumexp([dist_logp_fn(value=value, **point) for value in values]),
decimal=decimal,
- err_msg=str(pt),
+ err_msg=str(point),
)
@@ -598,7 +678,7 @@ def assert_moment_is_expected(model, expected, check_finite_logp=True):
logp_moment = (
joint_logp(
(model["x"],),
- rvs_to_values={model["x"]: at.constant(moment)},
+ rvs_to_values={model["x"]: pt.constant(moment)},
rvs_to_transforms={},
)[0]
.sum()
@@ -607,7 +687,7 @@ def assert_moment_is_expected(model, expected, check_finite_logp=True):
assert np.isfinite(logp_moment)
-def pymc_random(
+def continuous_random_tester(
dist,
paramdomains,
ref_rand,
@@ -629,12 +709,12 @@ def pymc_random(
pymc_rand = compile_pymc([], model_dist)
domains = paramdomains.copy()
- for pt in product(domains, n_samples=100):
- pt = pm.Point(pt, model=model)
- pt.update(model_args)
+ for point in product(domains, n_samples=100):
+ point = pm.Point(point, model=model)
+ point.update(model_args)
# Update the shared parameter variables in `param_vars`
- for k, v in pt.items():
+ for k, v in point.items():
nv = param_vars.get(k, model.named_vars.get(k))
if nv.name in param_vars:
param_vars[nv.name].set_value(v)
@@ -645,13 +725,13 @@ def pymc_random(
f = fails
while p <= alpha and f > 0:
s0 = pymc_rand()
- s1 = floatX(ref_rand(size=size, **pt))
+ s1 = floatX(ref_rand(size=size, **point))
_, p = st.ks_2samp(np.atleast_1d(s0).flatten(), np.atleast_1d(s1).flatten())
f -= 1
- assert p > alpha, str(pt)
+ assert p > alpha, str(point)
-def pymc_random_discrete(
+def discrete_random_tester(
dist,
paramdomains,
valuedomain=None,
@@ -668,12 +748,12 @@ def pymc_random_discrete(
pymc_rand = compile_pymc([], model_dist)
domains = paramdomains.copy()
- for pt in product(domains, n_samples=100):
- pt = pm.Point(pt, model=model)
+ for point in product(domains, n_samples=100):
+ point = pm.Point(point, model=model)
p = alpha
# Update the shared parameter variables in `param_vars`
- for k, v in pt.items():
+ for k, v in point.items():
nv = param_vars.get(k, model.named_vars.get(k))
if nv.name in param_vars:
param_vars[nv.name].set_value(v)
@@ -683,7 +763,7 @@ def pymc_random_discrete(
f = fails
while p <= alpha and f > 0:
o = pymc_rand()
- e = intX(ref_rand(size=size, **pt))
+ e = intX(ref_rand(size=size, **point))
o = np.atleast_1d(o).flatten()
e = np.atleast_1d(e).flatten()
bins = min(20, max(len(set(e)), len(set(o))))
@@ -695,12 +775,29 @@ def pymc_random_discrete(
else:
_, p = st.chisquare(observed + 1, expected + 1)
f -= 1
- assert p > alpha, str(pt)
+ assert p > alpha, str(point)
+
+
+class SeededTest:
+ random_seed = 20160911
+ random_state = None
+
+ @classmethod
+ def setup_class(cls):
+ nr.seed(cls.random_seed)
+
+ def setup_method(self):
+ nr.seed(self.random_seed)
+
+ def get_random_state(self, reset=False):
+ if self.random_state is None or reset:
+ self.random_state = nr.RandomState(self.random_seed)
+ return self.random_state
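+
+ # Usage sketch (illustrative only, not part of the original change): test classes
+ # subclass SeededTest to get deterministic draws, e.g.
+ #
+ #   class TestMyDistribution(SeededTest):
+ #       def test_reference_draws(self):
+ #           draws = self.get_random_state().normal(size=3)
+ #           npt.assert_array_equal(draws, nr.RandomState(self.random_seed).normal(size=3))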
class BaseTestDistributionRandom(SeededTest):
"""
- This class provides a base for tests that new RandomVariables are correctly
+ Base class for testing that new RandomVariables are correctly
implemented, and that the mapping of parameters between the PyMC
Distribution and the respective RandomVariable is correct.
@@ -762,7 +859,7 @@ class BaseTestDistributionRandom(SeededTest):
reference_dist: Optional[Callable] = None
reference_dist_params: Optional[dict] = None
expected_rv_op_params: Optional[dict] = None
- checks_to_run = []
+ checks_to_run: List[str] = []
size = 15
decimal = select_by_precision(float64=6, float32=3)
@@ -864,3 +961,21 @@ def seeded_numpy_distribution_builder(dist_name: str) -> Callable:
return lambda self: ft.partial(
getattr(np.random.RandomState, dist_name), self.get_random_state()
)
+
+
+def assert_no_rvs(var):
+ """Assert that there are no `MeasurableVariable` nodes in a graph."""
+
+ def expand(r):
+ owner = r.owner
+ if owner:
+ inputs = list(reversed(owner.inputs))
+
+ if isinstance(owner.op, HasInnerGraph):
+ inputs += owner.op.inner_outputs
+
+ return inputs
+
+ for v in walk([var], expand, False):
+ if v.owner and isinstance(v.owner.op, (RandomVariable, MeasurableVariable)):
+ raise AssertionError(f"RV found in graph: {v}")
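+
+
+# Usage sketch (illustrative only, not part of the original change):
+#
+#   x = pt.random.normal()
+#   assert_no_rvs(pt.exp(pt.constant(1.0)))  # passes: no RV nodes in the graph
+#   assert_no_rvs(pt.exp(x))                 # raises AssertionError("RV found in graph: ...")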
diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index ec64c13fb8..1030052bc5 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -17,7 +17,7 @@
import pytensor
from arviz import InferenceData
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.graph.basic import Variable
from pytensor.tensor.var import TensorVariable
@@ -60,7 +60,7 @@ def rho(self):
@node_property
def cov(self):
var = rho2sigma(self.rho) ** 2
- return at.diag(var)
+ return pt.diag(var)
@node_property
def std(self):
@@ -107,8 +107,8 @@ def symbolic_random(self):
def symbolic_logq_not_scaled(self):
z0 = self.symbolic_initial
std = rho2sigma(self.rho)
- logdet = at.log(std)
- quaddist = -0.5 * z0**2 - at.log((2 * np.pi) ** 0.5)
+ logdet = pt.log(std)
+ quaddist = -0.5 * z0**2 - pt.log((2 * np.pi) ** 0.5)
logq = quaddist - logdet
return logq.sum(range(1, logq.ndim))
@@ -140,10 +140,10 @@ def create_shared_params(self, start=None):
@node_property
def L(self):
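+ # (Illustrative note, not part of the original change) the dense factor is filled
+ # from the packed ``L_tril`` parameter vector, with its diagonal passed through
+ # ``rho2sigma`` so that it stays positive.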
- L = at.zeros((self.ddim, self.ddim))
- L = at.set_subtensor(L[self.tril_indices], self.params_dict["L_tril"])
+ L = pt.zeros((self.ddim, self.ddim))
+ L = pt.set_subtensor(L[self.tril_indices], self.params_dict["L_tril"])
Ld = L[..., np.arange(self.ddim), np.arange(self.ddim)]
- L = at.set_subtensor(Ld, rho2sigma(Ld))
+ L = pt.set_subtensor(Ld, rho2sigma(Ld))
return L
@node_property
@@ -157,7 +157,7 @@ def cov(self):
@node_property
def std(self):
- return at.sqrt(at.diag(self.cov))
+ return pt.sqrt(pt.diag(self.cov))
@property
def num_tril_entries(self):
@@ -171,9 +171,9 @@ def tril_indices(self):
@node_property
def symbolic_logq_not_scaled(self):
z0 = self.symbolic_initial
- diag = at.diagonal(self.L, 0, self.L.ndim - 2, self.L.ndim - 1)
- logdet = at.log(diag)
- quaddist = -0.5 * z0**2 - at.log((2 * np.pi) ** 0.5)
+ diag = pt.diagonal(self.L, 0, self.L.ndim - 2, self.L.ndim - 1)
+ logdet = pt.log(diag)
+ quaddist = -0.5 * z0**2 - pt.log((2 * np.pi) ** 0.5)
logq = quaddist - logdet
return logq.sum(range(1, logq.ndim))
@@ -251,7 +251,7 @@ def randidx(self, size=None):
pass
else:
size = tuple(np.atleast_1d(size))
- return at.random.integers(
+ return pt.random.integers(
size=size,
low=0,
high=self.histogram.shape[0],
@@ -262,11 +262,11 @@ def _new_initial(self, size, deterministic, more_replacements=None):
pytensor_condition_is_here = isinstance(deterministic, Variable)
if size is None:
size = 1
- size = at.as_tensor(size)
+ size = pt.as_tensor(size)
if pytensor_condition_is_here:
- return at.switch(
+ return pt.switch(
deterministic,
- at.repeat(self.mean.reshape((1, -1)), size, -1),
+ pt.repeat(self.mean.reshape((1, -1)), size, -1),
self.histogram[self.randidx(size)],
)
else:
@@ -274,7 +274,7 @@ def _new_initial(self, size, deterministic, more_replacements=None):
raise NotImplementedInference(
"Deterministic sampling from a Histogram is broken in v4"
)
- return at.repeat(self.mean.reshape((1, -1)), size, -1)
+ return pt.repeat(self.mean.reshape((1, -1)), size, -1)
else:
return self.histogram[self.randidx(size)]
@@ -297,7 +297,7 @@ def cov(self):
@node_property
def std(self):
- return at.sqrt(at.diag(self.cov))
+ return pt.sqrt(pt.diag(self.cov))
def __str__(self):
if isinstance(self.histogram, pytensor.compile.SharedVariable):
diff --git a/pymc/variational/minibatch_rv.py b/pymc/variational/minibatch_rv.py
index c5d2a85aca..0ee4b060ca 100644
--- a/pymc/variational/minibatch_rv.py
+++ b/pymc/variational/minibatch_rv.py
@@ -19,8 +19,12 @@
from pytensor.graph import Apply, Op
from pytensor.tensor import NoneConst, TensorVariable, as_tensor_variable
-from pymc.logprob.abstract import MeasurableVariable, _get_measurable_outputs, _logprob
-from pymc.logprob.abstract import logprob as logprob_logprob
+from pymc.logprob.abstract import (
+ MeasurableVariable,
+ _get_measurable_outputs,
+ _logprob,
+ _logprob_helper,
+)
from pymc.logprob.utils import ignore_logprob
@@ -110,4 +114,4 @@ def _get_measurable_outputs_minibatch_random_variable(op, node):
def minibatch_rv_logprob(op, values, *inputs, **kwargs):
[value] = values
rv, *total_size = inputs
- return logprob_logprob(rv, value, **kwargs) * get_scaling(total_size, value.shape)
+ return _logprob_helper(rv, value, **kwargs) * get_scaling(total_size, value.shape)
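+
+
+# (Illustrative note, not part of the original change) ``get_scaling`` multiplies the
+# elementwise logp by (roughly) total_size / minibatch_size, so that the summed
+# minibatch term approximates the contribution of the full dataset.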
diff --git a/pymc/variational/opvi.py b/pymc/variational/opvi.py
index 7bc4325f5e..972765156f 100644
--- a/pymc/variational/opvi.py
+++ b/pymc/variational/opvi.py
@@ -55,7 +55,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import xarray
from pytensor.graph.basic import Variable
@@ -160,7 +160,7 @@ def try_to_set_test_value(node_in, node_out, s):
_s = s
if s is None:
s = 1
- s = pytensor.compile.view_op(at.as_tensor(s))
+ s = pytensor.compile.view_op(pt.as_tensor(s))
if not isinstance(node_in, (list, tuple)):
node_in = [node_in]
if not isinstance(node_out, (list, tuple)):
@@ -805,7 +805,7 @@ def _check_user_params(self, **kwargs):
spec = self.get_param_spec_for(d=self.ddim, **kwargs.pop("spec_kw", {}))
for name, param in self.user_params.items():
shape = spec[name]
- self._user_params[name] = at.as_tensor(param).reshape(shape)
+ self._user_params[name] = pt.as_tensor(param).reshape(shape)
return True
def _initial_type(self, name):
@@ -819,7 +819,7 @@ def _initial_type(self, name):
-------
tensor
"""
- return at.matrix(name)
+ return pt.matrix(name)
def _input_type(self, name):
R"""*Dev* - input type with given name. The correct type depends on `self.batched`
@@ -832,7 +832,7 @@ def _input_type(self, name):
-------
tensor
"""
- return at.vector(name)
+ return pt.vector(name)
@pytensor.config.change_flags(compute_test_value="off")
def __init_group__(self, group):
@@ -909,7 +909,7 @@ def _new_initial_shape(self, size, dim, more_replacements=None):
-------
shape vector
"""
- return at.stack([size, dim])
+ return pt.stack([size, dim])
@node_property
def ndim(self):
@@ -950,18 +950,18 @@ def _new_initial(self, size, deterministic, more_replacements=None):
deterministic = np.int8(deterministic)
dim, dist_name, dist_map = (self.ddim, self.initial_dist_name, self.initial_dist_map)
dtype = self.symbolic_initial.dtype
- dim = at.as_tensor(dim)
- size = at.as_tensor(size)
+ dim = pt.as_tensor(dim)
+ size = pt.as_tensor(size)
shape = self._new_initial_shape(size, dim, more_replacements)
# apply optimizations if possible
if not isinstance(deterministic, Variable):
if deterministic:
- return at.ones(shape, dtype) * dist_map
+ return pt.ones(shape, dtype) * dist_map
else:
- return getattr(at.random, dist_name)(size=shape)
+ return getattr(pt.random, dist_name)(size=shape)
else:
- sample = getattr(at.random, dist_name)(size=shape)
- initial = at.switch(deterministic, at.ones(shape, dtype) * dist_map, sample)
+ sample = getattr(pt.random, dist_name)(size=shape)
+ initial = pt.switch(deterministic, pt.ones(shape, dtype) * dist_map, sample)
return initial
@node_property
@@ -1020,7 +1020,7 @@ def symbolic_sample_over_posterior(self, node):
"""
node = self.to_flat_input(node)
random = self.symbolic_random.astype(self.symbolic_initial.dtype)
- random = at.specify_shape(random, self.symbolic_initial.type.shape)
+ random = pt.specify_shape(random, self.symbolic_initial.type.shape)
def sample(post, *_):
return pytensor.clone_replace(node, {self.input: post})
@@ -1058,7 +1058,7 @@ def make_size_and_deterministic_replacements(self, s, d, more_replacements=None)
dict with replacements for initial
"""
initial = self._new_initial(s, d, more_replacements)
- initial = at.specify_shape(initial, self.symbolic_initial.type.shape)
+ initial = pt.specify_shape(initial, self.symbolic_initial.type.shape)
if more_replacements:
initial = pytensor.clone_replace(initial, more_replacements)
return {self.symbolic_initial: initial}
@@ -1067,7 +1067,7 @@ def make_size_and_deterministic_replacements(self, s, d, more_replacements=None)
def symbolic_normalizing_constant(self):
"""*Dev* - normalizing constant for `self.logq`, scales it to `minibatch_size` instead of `total_size`"""
t = self.to_flat_input(
- at.max(
+ pt.max(
[
get_scaling(v.owner.inputs[1:], v.shape)
for v in self.group
@@ -1110,21 +1110,21 @@ def __str__(self):
return f"{self.__class__.__name__}[{shp}]"
@node_property
- def std(self) -> at.TensorVariable:
+ def std(self) -> pt.TensorVariable:
"""Standard deviation of the latent variables as an unstructured 1-dimensional tensor variable"""
raise NotImplementedError()
@node_property
- def cov(self) -> at.TensorVariable:
+ def cov(self) -> pt.TensorVariable:
"""Covariance between the latent variables as an unstructured 2-dimensional tensor variable"""
raise NotImplementedError()
@node_property
- def mean(self) -> at.TensorVariable:
+ def mean(self) -> pt.TensorVariable:
"""Mean of the latent variables as an unstructured 1-dimensional tensor variable"""
raise NotImplementedError()
- def var_to_data(self, shared: at.TensorVariable) -> xarray.Dataset:
+ def var_to_data(self, shared: pt.TensorVariable) -> xarray.Dataset:
"""Takes a flat 1-dimensional tensor variable and maps it to an xarray data set based on the information in
`self.ordering`.
"""
@@ -1236,7 +1236,7 @@ def symbolic_normalizing_constant(self):
"""*Dev* - normalizing constant for `self.logq`, scales it to `minibatch_size` instead of `total_size`.
Here the effect is controlled by `self.scale_cost_to_minibatch`
"""
- t = at.max(
+ t = pt.max(
self.collect("symbolic_normalizing_constant")
+ [
get_scaling(obs.owner.inputs[1:], obs.shape)
@@ -1244,18 +1244,18 @@ def symbolic_normalizing_constant(self):
if isinstance(obs.owner.op, MinibatchRandomVariable)
]
)
- t = at.switch(self._scale_cost_to_minibatch, t, at.constant(1, dtype=t.dtype))
+ t = pt.switch(self._scale_cost_to_minibatch, t, pt.constant(1, dtype=t.dtype))
return pm.floatX(t)
@node_property
def symbolic_logq(self):
"""*Dev* - collects `symbolic_logq` for all groups"""
- return at.add(*self.collect("symbolic_logq"))
+ return pt.add(*self.collect("symbolic_logq"))
@node_property
def logq(self):
"""*Dev* - collects `logQ` for all groups"""
- return at.add(*self.collect("logq"))
+ return pt.add(*self.collect("logq"))
@node_property
def logq_norm(self):
@@ -1499,7 +1499,7 @@ def vars_names(vs):
@node_property
def sample_dict_fn(self):
- s = at.iscalar()
+ s = pt.iscalar()
names = [self.model.rvs_to_values[v].name for v in self.model.free_RVs]
sampled = [self.rslice(name) for name in names]
sampled = self.set_size_and_deterministic(sampled, s, 0)
@@ -1569,7 +1569,7 @@ def ddim(self):
@node_property
def symbolic_random(self):
- return at.concatenate(self.collect("symbolic_random"), axis=-1)
+ return pt.concatenate(self.collect("symbolic_random"), axis=-1)
def __str__(self):
if len(self.groups) < 5:
@@ -1590,7 +1590,7 @@ def any_histograms(self):
def joint_histogram(self):
if not self.all_histograms:
raise VariationalInferenceError("%s does not consist of all Empirical approximations")
- return at.concatenate(self.collect("histogram"), axis=-1)
+ return pt.concatenate(self.collect("histogram"), axis=-1)
@property
def params(self):
diff --git a/pymc/variational/stein.py b/pymc/variational/stein.py
index 4768e8a52d..1f8c034f80 100644
--- a/pymc/variational/stein.py
+++ b/pymc/variational/stein.py
@@ -13,7 +13,7 @@
# limitations under the License.
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pymc.pytensorf import floatX
from pymc.util import WithMemoization, locally_cachedmethod
@@ -47,12 +47,12 @@ def approx_symbolic_matrices(self):
@node_property
def dlogp(self):
logp = self.logp_norm.sum()
- grad = at.grad(logp, self.approx_symbolic_matrices)
+ grad = pt.grad(logp, self.approx_symbolic_matrices)
def flatten2(tensor):
return tensor.flatten(2)
- return at.concatenate(list(map(flatten2, grad)), -1)
+ return pt.concatenate(list(map(flatten2, grad)), -1)
@node_property
def grad(self):
@@ -65,7 +65,7 @@ def grad(self):
def density_part_grad(self):
Kxy = self.Kxy
dlogpdx = self.dlogp
- return at.dot(Kxy, dlogpdx)
+ return pt.dot(Kxy, dlogpdx)
@node_property
def repulsive_part_grad(self):
diff --git a/pymc/variational/test_functions.py b/pymc/variational/test_functions.py
index 33abc6195d..65dd36b32f 100644
--- a/pymc/variational/test_functions.py
+++ b/pymc/variational/test_functions.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pymc.pytensorf import floatX
from pymc.variational.opvi import TestFunction
@@ -34,30 +34,30 @@ class Kernel(TestFunction):
class RBF(Kernel):
def __call__(self, X):
XY = X.dot(X.T)
- x2 = at.sum(X**2, axis=1).dimshuffle(0, "x")
- X2e = at.repeat(x2, X.shape[0], axis=1)
+ x2 = pt.sum(X**2, axis=1).dimshuffle(0, "x")
+ X2e = pt.repeat(x2, X.shape[0], axis=1)
H = X2e + X2e.T - 2.0 * XY
- V = at.sort(H.flatten())
+ V = pt.sort(H.flatten())
length = V.shape[0]
# median distance
- m = at.switch(
- at.eq((length % 2), 0),
+ m = pt.switch(
+ pt.eq((length % 2), 0),
# if even vector
- at.mean(V[((length // 2) - 1) : ((length // 2) + 1)]),
+ pt.mean(V[((length // 2) - 1) : ((length // 2) + 1)]),
# if odd vector
V[length // 2],
)
- h = 0.5 * m / at.log(floatX(H.shape[0]) + floatX(1))
+ h = 0.5 * m / pt.log(floatX(H.shape[0]) + floatX(1))
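+ # (Illustrative note, not part of the original change) this is the usual SVGD
+ # median heuristic: the bandwidth is picked so that sum_j k(x_i, x_j) is roughly 1.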
# RBF
- Kxy = at.exp(-H / h / 2.0)
+ Kxy = pt.exp(-H / h / 2.0)
# Derivative
- dxkxy = -at.dot(Kxy, X)
- sumkxy = at.sum(Kxy, axis=-1, keepdims=True)
- dxkxy = at.add(dxkxy, at.mul(X, sumkxy)) / h
+ dxkxy = -pt.dot(Kxy, X)
+ sumkxy = pt.sum(Kxy, axis=-1, keepdims=True)
+ dxkxy = pt.add(dxkxy, pt.mul(X, sumkxy)) / h
return Kxy, dxkxy
diff --git a/pymc/variational/updates.py b/pymc/variational/updates.py
index 7dae049446..a6d049608e 100644
--- a/pymc/variational/updates.py
+++ b/pymc/variational/updates.py
@@ -94,11 +94,11 @@
>>> from lasagne.updates import sgd, apply_momentum
>>> l_in = InputLayer((100, 20))
>>> l1 = DenseLayer(l_in, num_units=3, nonlinearity=softmax)
->>> x = at.matrix('x') # shp: num_batch x num_features
->>> y = at.ivector('y') # shp: num_batch
+>>> x = pt.matrix('x') # shp: num_batch x num_features
+>>> y = pt.ivector('y') # shp: num_batch
>>> l_out = get_output(l1, x)
>>> params = lasagne.layers.get_all_params(l1)
->>> loss = at.mean(at.nnet.categorical_crossentropy(l_out, y))
+>>> loss = pt.mean(pt.nnet.categorical_crossentropy(l_out, y))
>>> updates_sgd = sgd(loss, params, learning_rate=0.0001)
>>> updates = apply_momentum(updates_sgd, params, momentum=0.9)
>>> train_function = pytensor.function([x, y], updates=updates)
@@ -113,7 +113,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pymc as pm
@@ -533,7 +533,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6):
accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape)
accu_new = accu + grad**2
updates[accu] = accu_new
- updates[param] = param - (learning_rate * grad / at.sqrt(accu_new + epsilon))
+ updates[param] = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon))
return updates
@@ -573,13 +573,13 @@ def adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilon
accu = pytensor.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype))
# Append squared gradient vector to accu_new
- accu_new = at.set_subtensor(accu[..., i_int], grad**2)
- i_new = at.switch((i + 1) < n_win, i + 1, 0)
+ accu_new = pt.set_subtensor(accu[..., i_int], grad**2)
+ i_new = pt.switch((i + 1) < n_win, i + 1, 0)
updates[accu] = accu_new
updates[i] = i_new
accu_sum = accu_new.sum(axis=-1)
- updates[param] = param - (learning_rate * grad / at.sqrt(accu_sum + epsilon))
+ updates[param] = param - (learning_rate * grad / pt.sqrt(accu_sum + epsilon))
return updates
@@ -652,14 +652,14 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon
updates = OrderedDict()
# Using pytensor constant to prevent upcasting of float32
- one = at.constant(1)
+ one = pt.constant(1)
for param, grad in zip(params, grads):
value = param.get_value(borrow=True)
accu = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape)
accu_new = rho * accu + (one - rho) * grad**2
updates[accu] = accu_new
- updates[param] = param - (learning_rate * grad / at.sqrt(accu_new + epsilon))
+ updates[param] = param - (learning_rate * grad / pt.sqrt(accu_new + epsilon))
return updates
@@ -742,7 +742,7 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil
updates = OrderedDict()
# Using pytensor constant to prevent upcasting of float32
- one = at.constant(1)
+ one = pt.constant(1)
for param, grad in zip(params, grads):
value = param.get_value(borrow=True)
@@ -758,7 +758,7 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil
updates[accu] = accu_new
# compute parameter update, using the 'old' delta_accu
- update = grad * at.sqrt(delta_accu + epsilon) / at.sqrt(accu_new + epsilon)
+ update = grad * pt.sqrt(delta_accu + epsilon) / pt.sqrt(accu_new + epsilon)
updates[param] = param - learning_rate * update
# update delta_accu (as accu, but accumulating updates)
@@ -833,10 +833,10 @@ def adam(
updates = OrderedDict()
# Using pytensor constant to prevent upcasting of float32
- one = at.constant(1)
+ one = pt.constant(1)
t = t_prev + 1
- a_t = learning_rate * at.sqrt(one - beta2**t) / (one - beta1**t)
+ a_t = learning_rate * pt.sqrt(one - beta2**t) / (one - beta1**t)
for param, g_t in zip(params, all_grads):
value = param.get_value(borrow=True)
@@ -845,7 +845,7 @@ def adam(
m_t = beta1 * m_prev + (one - beta1) * g_t
v_t = beta2 * v_prev + (one - beta2) * g_t**2
- step = a_t * m_t / (at.sqrt(v_t) + epsilon)
+ step = a_t * m_t / (pt.sqrt(v_t) + epsilon)
updates[m_prev] = m_t
updates[v_prev] = v_t
@@ -917,7 +917,7 @@ def adamax(
updates = OrderedDict()
# Using pytensor constant to prevent upcasting of float32
- one = at.constant(1)
+ one = pt.constant(1)
t = t_prev + 1
a_t = learning_rate / (one - beta1**t)
@@ -928,7 +928,7 @@ def adamax(
u_prev = pytensor.shared(np.zeros(value.shape, dtype=value.dtype), shape=param.type.shape)
m_t = beta1 * m_prev + (one - beta1) * g_t
- u_t = at.maximum(beta2 * u_prev, abs(g_t))
+ u_t = pt.maximum(beta2 * u_prev, abs(g_t))
step = a_t * m_t / (u_t + epsilon)
updates[m_prev] = m_t
@@ -1010,8 +1010,8 @@ def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
)
dtype = np.dtype(pytensor.config.floatX).type
- norms = at.sqrt(at.sum(at.sqr(tensor_var), axis=sum_over, keepdims=True))
- target_norms = at.clip(norms, 0, dtype(max_norm))
+ norms = pt.sqrt(pt.sum(pt.sqr(tensor_var), axis=sum_over, keepdims=True))
+ target_norms = pt.clip(norms, 0, dtype(max_norm))
constrained_output = tensor_var * (target_norms / (dtype(epsilon) + norms))
return constrained_output
@@ -1051,14 +1051,14 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False
>>> from lasagne.layers import InputLayer, DenseLayer
>>> import lasagne
>>> from lasagne.updates import sgd, total_norm_constraint
- >>> x = at.matrix()
- >>> y = at.ivector()
+ >>> x = pt.matrix()
+ >>> y = pt.ivector()
>>> l_in = InputLayer((5, 10))
- >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=at.special.softmax)
+ >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=pt.special.softmax)
>>> output = lasagne.layers.get_output(l1, x)
- >>> cost = at.mean(at.nnet.categorical_crossentropy(output, y))
+ >>> cost = pt.mean(pt.nnet.categorical_crossentropy(output, y))
>>> all_params = lasagne.layers.get_all_params(l1)
- >>> all_grads = at.grad(cost, all_params)
+ >>> all_grads = pt.grad(cost, all_params)
>>> scaled_grads = total_norm_constraint(all_grads, 5)
>>> updates = sgd(scaled_grads, all_params, learning_rate=0.1)
@@ -1072,9 +1072,9 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False
learning with neural networks. In Advances in Neural Information
Processing Systems (pp. 3104-3112).
"""
- norm = at.sqrt(sum(at.sum(tensor**2) for tensor in tensor_vars))
+ norm = pt.sqrt(sum(pt.sum(tensor**2) for tensor in tensor_vars))
dtype = np.dtype(pytensor.config.floatX).type
- target_norm = at.clip(norm, 0, dtype(max_norm))
+ target_norm = pt.clip(norm, 0, dtype(max_norm))
multiplier = target_norm / (dtype(epsilon) + norm)
tensor_vars_scaled = [step * multiplier for step in tensor_vars]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 17dd9ba816..d583a69806 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -18,7 +18,7 @@ numpydoc
pandas>=0.24.0
polyagamma
pre-commit>=2.8.0
-pytensor==2.10.1
+pytensor>=2.11.0,<2.12
pytest-cov>=2.5
pytest>=3.0
scipy>=1.4.1
diff --git a/requirements.txt b/requirements.txt
index dd11544039..f3d4bb6701 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,6 @@ cloudpickle
fastprogress>=0.2.0
numpy>=1.15.0
pandas>=0.24.0
-pytensor==2.10.1
+pytensor>=2.11.0,<2.12
scipy>=1.4.1
typing-extensions>=3.7.4
diff --git a/scripts/dev.Dockerfile b/scripts/dev.Dockerfile
index 624149455d..0d6fd204a2 100644
--- a/scripts/dev.Dockerfile
+++ b/scripts/dev.Dockerfile
@@ -1,4 +1,4 @@
-FROM ghcr.io/mamba-org/micromamba-devcontainer:git-5185ae9
+FROM ghcr.io/mamba-org/micromamba-devcontainer:git-e04d158
COPY --chown=${MAMBA_USER}:${MAMBA_USER} conda-envs/environment-dev.yml /tmp/environment-dev.yml
RUN : \
@@ -10,12 +10,13 @@ RUN : \
ARG MAMBA_DOCKERFILE_ACTIVATE=1
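+# Setting PRE_COMMIT_HOME globally (instead of only for the install-hooks call below)
+# keeps the prebuilt hook cache in use at container runtime as well -- illustrative
+# note, not part of the original change.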
+ENV PRE_COMMIT_HOME=/opt/.pre-commit-cache-prebuilt
COPY --chown=${MAMBA_USER}:${MAMBA_USER} .pre-commit-config.yaml /fake-repo/.pre-commit-config.yaml
RUN : \
&& sudo mkdir --mode=777 /opt/.pre-commit-cache-prebuilt \
&& cd /fake-repo \
&& git init \
- && PRE_COMMIT_HOME=/opt/.pre-commit-cache-prebuilt pre-commit install-hooks \
+ && pre-commit install-hooks \
&& sudo rm -rf /fake-repo \
&& sudo chmod -R a+rwx /opt/.pre-commit-cache-prebuilt \
;
diff --git a/scripts/run_mypy.py b/scripts/run_mypy.py
index 3774ad8333..2ad9c8a6f9 100644
--- a/scripts/run_mypy.py
+++ b/scripts/run_mypy.py
@@ -30,7 +30,7 @@
pymc/distributions/truncated.py
pymc/initial_point.py
pymc/logprob/censoring.py
-pymc/logprob/joint_logprob.py
+pymc/logprob/basic.py
pymc/logprob/mixture.py
pymc/logprob/rewriting.py
pymc/logprob/scan.py
diff --git a/setup.py b/setup.py
index c162e20654..5eedc9fb4d 100755
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering",
@@ -65,11 +66,10 @@
url=URL,
long_description=LONG_DESCRIPTION,
long_description_content_type="text/x-rst",
- packages=find_packages(),
+ packages=find_packages(exclude=["tests*"]),
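+ # The top-level tests/ package is deliberately excluded from the built distribution
+ # (illustrative note, not part of the original change).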
# because of an upload-size limit by PyPI, we're temporarily removing docs from the tarball.
# Also see MANIFEST.in
# package_data={'docs': ['*']},
- include_package_data=True,
classifiers=classifiers,
python_requires=">=3.8",
install_requires=install_reqs,
diff --git a/tests/backends/test_arviz.py b/tests/backends/test_arviz.py
index 73c3e9b5cf..16b6bb9e86 100644
--- a/tests/backends/test_arviz.py
+++ b/tests/backends/test_arviz.py
@@ -17,7 +17,7 @@
from typing import Dict, Tuple
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from arviz import InferenceData
@@ -643,7 +643,7 @@ def test_issue_5043_autoconvert_coord_values(self):
def test_variable_dimension_name_collision(self):
with pytest.raises(ValueError, match="same name as its dimension"):
with pm.Model() as pmodel:
- var = at.as_tensor([1, 2, 3])
+ var = pt.as_tensor([1, 2, 3])
pmodel.register_rv(var, name="time", dims=("time",))
def test_include_transformed(self):
diff --git a/tests/distributions/test_continuous.py b/tests/distributions/test_continuous.py
index dc4c88b567..1f673eb285 100644
--- a/tests/distributions/test_continuous.py
+++ b/tests/distributions/test_continuous.py
@@ -13,11 +13,12 @@
# limitations under the License.
import functools as ft
+import warnings
import numpy as np
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.special as sp
import scipy.stats as st
@@ -26,13 +27,12 @@
import pymc as pm
-from pymc.distributions.continuous import Normal, get_tau_sigma, interpolated
+from pymc.distributions.continuous import Normal, Uniform, get_tau_sigma, interpolated
from pymc.distributions.dist_math import clipped_beta_rvs
-from pymc.logprob.abstract import logcdf
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import icdf, logcdf, logp
from pymc.logprob.utils import ParameterValueError
from pymc.pytensorf import floatX
-from tests.distributions.util import (
+from pymc.testing import (
BaseTestDistributionRandom,
Circ,
Domain,
@@ -43,13 +43,14 @@
Runif,
Unit,
assert_moment_is_expected,
+ check_icdf,
check_logcdf,
check_logp,
- pymc_random,
+ continuous_random_tester,
seeded_numpy_distribution_builder,
seeded_scipy_distribution_builder,
+ select_by_precision,
)
-from tests.helpers import select_by_precision
from tests.logprob.utils import create_pytensor_params, scipy_logprob_tester
try:
@@ -159,14 +160,6 @@ def laplace_asymmetric_logpdf(value, kappa, b, mu):
return lPx
-def beta_mu_sigma(value, mu, sigma):
- kappa = mu * (1 - mu) / sigma**2 - 1
- if kappa > 0:
- return st.beta.logpdf(value, mu * kappa, (1 - mu) * kappa)
- else:
- return -np.inf
-
-
class TestMatchesScipy:
def test_uniform(self):
check_logp(
@@ -183,6 +176,12 @@ def test_uniform(self):
lambda value, lower, upper: st.uniform.logcdf(value, lower, upper - lower),
skip_paramdomain_outside_edge_test=True,
)
+ check_icdf(
+ pm.Uniform,
+ {"lower": -Rplusunif, "upper": Rplusunif},
+ lambda q, lower, upper: st.uniform.ppf(q=q, loc=lower, scale=upper - lower),
+ skip_paramdomain_outside_edge_test=True,
+ )
# Custom logp / logcdf check for invalid parameters
invalid_dist = pm.Uniform.dist(lower=1, upper=0)
with pytensor.config.change_flags(mode=Mode("py")):
@@ -190,6 +189,8 @@ def test_uniform(self):
logp(invalid_dist, np.array(0.5)).eval()
with pytest.raises(ParameterValueError):
logcdf(invalid_dist, np.array(0.5)).eval()
+ with pytest.raises(ParameterValueError):
+ icdf(invalid_dist, np.array(0.5)).eval()
def test_triangular(self):
check_logp(
@@ -278,6 +279,11 @@ def test_normal(self):
lambda value, mu, sigma: st.norm.logcdf(value, mu, sigma),
decimal=select_by_precision(float64=6, float32=1),
)
+ check_icdf(
+ pm.Normal,
+ {"mu": R, "sigma": Rplus},
+ lambda q, mu, sigma: st.norm.ppf(q, mu, sigma),
+ )
def test_half_normal(self):
check_logp(
@@ -356,9 +362,11 @@ def test_wald_logp_custom_points(self, value, mu, lam, phi, alpha, logp):
# http://www.gamlss.org/.
with pm.Model() as model:
pm.Wald("wald", mu=mu, lam=lam, phi=phi, alpha=alpha, transform=None)
- pt = {"wald": value}
+ point = {"wald": value}
decimals = select_by_precision(float64=6, float32=1)
- npt.assert_almost_equal(model.compile_logp()(pt), logp, decimal=decimals, err_msg=str(pt))
+ npt.assert_almost_equal(
+ model.compile_logp()(point), logp, decimal=decimals, err_msg=str(point)
+ )
def test_beta_logp(self):
check_logp(
@@ -367,10 +375,18 @@ def test_beta_logp(self):
{"alpha": Rplus, "beta": Rplus},
lambda value, alpha, beta: st.beta.logpdf(value, alpha, beta),
)
+
+ def beta_mu_sigma(value, mu, sigma):
+ kappa = mu * (1 - mu) / sigma**2 - 1
+ return st.beta.logpdf(value, mu * kappa, (1 - mu) * kappa)
+
+ # The mu/sigma parametrization is not always valid
+ safe_mu_domain = Domain([0, 0.3, 0.5, 0.8, 1])
+ safe_sigma_domain = Domain([0, 0.05, 0.1, np.inf])
check_logp(
pm.Beta,
Unit,
- {"mu": Unit, "sigma": Rplus},
+ {"mu": safe_mu_domain, "sigma": safe_sigma_domain},
beta_mu_sigma,
)
@@ -422,6 +438,11 @@ def test_exponential(self):
{"lam": Rplus},
lambda value, lam: st.expon.logcdf(value, 0, 1 / lam),
)
+ check_icdf(
+ pm.Exponential,
+ {"lam": Rplus},
+ lambda q, lam: st.expon.ppf(q, loc=0, scale=1 / lam),
+ )
def test_laplace(self):
check_logp(
@@ -875,24 +896,26 @@ def scipy_logp(value, mu, sigma, lower, upper):
assert np.isinf(logp[2])
def test_get_tau_sigma(self):
- sigma = np.array(2)
- npt.assert_almost_equal(get_tau_sigma(sigma=sigma), [1.0 / sigma**2, sigma])
+ # Fail on warnings
+ with warnings.catch_warnings():
+ warnings.simplefilter("error")
- tau = np.array(2)
- npt.assert_almost_equal(get_tau_sigma(tau=tau), [tau, tau**-0.5])
+ sigma = np.array(2)
+ npt.assert_almost_equal(get_tau_sigma(sigma=sigma), [1.0 / sigma**2, sigma])
- tau, _ = get_tau_sigma(sigma=at.constant(-2))
- with pytest.raises(ParameterValueError):
- tau.eval()
+ tau = np.array(2)
+ npt.assert_almost_equal(get_tau_sigma(tau=tau), [tau, tau**-0.5])
- _, sigma = get_tau_sigma(tau=at.constant(-2))
- with pytest.raises(ParameterValueError):
- sigma.eval()
+ tau, _ = get_tau_sigma(sigma=pt.constant(-2))
+ npt.assert_almost_equal(tau.eval(), -0.25)
- sigma = [1, 2]
- npt.assert_almost_equal(
- get_tau_sigma(sigma=sigma), [1.0 / np.array(sigma) ** 2, np.array(sigma)]
- )
+ _, sigma = get_tau_sigma(tau=pt.constant(-2))
+ npt.assert_almost_equal(sigma.eval(), -np.sqrt(1 / 2))
+
+ sigma = [1, 2]
+ npt.assert_almost_equal(
+ get_tau_sigma(sigma=sigma), [1.0 / np.array(sigma) ** 2, np.array(sigma)]
+ )
@pytest.mark.parametrize(
"value,mu,sigma,nu,logp",
@@ -915,12 +938,12 @@ def test_ex_gaussian(self, value, mu, sigma, nu, logp):
See e.g., doi: 10.1111/j.1467-9876.2005.00510.x, or http://www.gamlss.org/."""
with pm.Model() as model:
pm.ExGaussian("eg", mu=mu, sigma=sigma, nu=nu)
- pt = {"eg": value}
+ point = {"eg": value}
npt.assert_almost_equal(
- model.compile_logp()(pt),
+ model.compile_logp()(point),
logp,
decimal=select_by_precision(float64=6, float32=2),
- err_msg=str(pt),
+ err_msg=str(point),
)
@@ -1503,6 +1526,10 @@ def test_rice_moment(self, nu, sigma, size, expected):
with pm.Model() as model:
pm.Rice("x", nu=nu, sigma=sigma, size=size)
+ @pytest.mark.skipif(
+ condition=_polyagamma_not_installed,
+ reason="`polyagamma package is not available/installed.",
+ )
@pytest.mark.parametrize(
"h, z, size, expected",
[
@@ -1722,6 +1749,22 @@ class TestStudentT(BaseTestDistributionRandom):
]
+class TestHalfStudentT(BaseTestDistributionRandom):
+ def halfstudentt_rng_fn(self, df, loc, scale, size, rng):
+ return np.abs(st.t.rvs(df=df, loc=loc, scale=scale, size=size, random_state=rng))
+
+ pymc_dist = pm.HalfStudentT
+ pymc_dist_params = {"nu": 5.0, "sigma": 2.0}
+ expected_rv_op_params = {"nu": 5.0, "sigma": 2.0}
+ reference_dist_params = {"df": 5.0, "loc": 0, "scale": 2.0}
+ reference_dist = lambda self: ft.partial(self.halfstudentt_rng_fn, rng=self.get_random_state())
+ checks_to_run = [
+ "check_pymc_params_match_rv_op",
+ "check_pymc_draws_match_reference",
+ "check_rv_size",
+ ]
+
+
class TestMoyal(BaseTestDistributionRandom):
pymc_dist = pm.Moyal
pymc_dist_params = {"mu": 0.0, "sigma": 1.0}
@@ -2259,27 +2302,9 @@ def dist(cls, **kwargs):
pdf_points = st.norm.pdf(x_points, loc=mu, scale=sigma)
return super().dist(x_points=x_points, pdf_points=pdf_points, **kwargs)
- pymc_random(
+ continuous_random_tester(
TestedInterpolated,
{},
extra_args={"rng": pytensor.shared(rng)},
ref_rand=ref_rand,
)
-
-
-class TestICDF:
- @pytest.mark.parametrize(
- "dist_params, obs, size",
- [
- ((0, 1), np.array([-0.5, 0, 0.3, 0.5, 1, 1.5], dtype=np.float64), ()),
- ((-1, 20), np.array([-0.5, 0, 0.3, 0.5, 1, 1.5], dtype=np.float64), ()),
- ((-1, 20), np.array([-0.5, 0, 0.3, 0.5, 1, 1.5], dtype=np.float64), (2, 3)),
- ],
- )
- def test_normal_icdf(self, dist_params, obs, size):
- dist_params_at, obs_at, size_at = create_pytensor_params(dist_params, obs, size)
- dist_params = dict(zip(dist_params_at, dist_params))
-
- x = Normal.dist(*dist_params_at, size=size_at)
-
- scipy_logprob_tester(x, obs, dist_params, test_fn=st.norm.ppf, test="icdf")
diff --git a/tests/distributions/test_discrete.py b/tests/distributions/test_discrete.py
index 7b15a6b587..6233adf3a4 100644
--- a/tests/distributions/test_discrete.py
+++ b/tests/distributions/test_discrete.py
@@ -18,7 +18,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.special as sp
import scipy.stats as st
@@ -29,12 +29,10 @@
import pymc as pm
from pymc.distributions.discrete import Geometric, _OrderedLogistic, _OrderedProbit
-from pymc.logprob.abstract import logcdf
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import icdf, logcdf, logp
from pymc.logprob.utils import ParameterValueError
from pymc.pytensorf import floatX
-from pymc.vartypes import discrete_types
-from tests.distributions.util import (
+from pymc.testing import (
BaseTestDistributionRandom,
Bool,
Domain,
@@ -52,12 +50,14 @@
UnitSortedVector,
Vector,
assert_moment_is_expected,
+ check_icdf,
check_logcdf,
check_logp,
check_selfconsistency_discrete_logcdf,
seeded_numpy_distribution_builder,
seeded_scipy_distribution_builder,
)
+from pymc.vartypes import discrete_types
from tests.logprob.utils import create_pytensor_params, scipy_logprob_tester
@@ -117,6 +117,12 @@ def test_discrete_unif(self):
Domain([-10, 0, 10], "int64"),
{"lower": -Rplusdunif, "upper": Rplusdunif},
)
+ check_icdf(
+ pm.DiscreteUniform,
+ {"lower": -Rplusdunif, "upper": Rplusdunif},
+ lambda q, lower, upper: st.randint.ppf(q=q, low=lower, high=upper + 1),
+ skip_paramdomain_outside_edge_test=True,
+ )
# Custom logp / logcdf check for invalid parameters
invalid_dist = pm.DiscreteUniform.dist(lower=1, upper=0)
with pytensor.config.change_flags(mode=Mode("py")):
@@ -124,6 +130,8 @@ def test_discrete_unif(self):
logp(invalid_dist, 0.5).eval()
with pytest.raises(ParameterValueError):
logcdf(invalid_dist, 2).eval()
+ with pytest.raises(ParameterValueError):
+ icdf(invalid_dist, np.array(1)).eval()
def test_geometric(self):
check_logp(
@@ -143,15 +151,14 @@ def test_geometric(self):
Nat,
{"p": Unit},
)
+ check_icdf(
+ pm.Geometric,
+ {"p": Unit},
+ st.geom.ppf,
+ )
def test_hypergeometric(self):
- def modified_scipy_hypergeom_logpmf(value, N, k, n):
- # Convert nan to -np.inf
- original_res = st.hypergeom.logpmf(value, N, k, n)
- return original_res if not np.isnan(original_res) else -np.inf
-
def modified_scipy_hypergeom_logcdf(value, N, k, n):
- # Convert nan to -np.inf
original_res = st.hypergeom.logcdf(value, N, k, n)
# Correct for scipy bug in logcdf method (see https://github.com/scipy/scipy/issues/13280)
@@ -160,24 +167,27 @@ def modified_scipy_hypergeom_logcdf(value, N, k, n):
if np.all(np.isnan(pmfs)):
original_res = np.nan
- return original_res if not np.isnan(original_res) else -np.inf
+ return original_res
+
+ N_domain = Domain([0, 10, 20, 30, np.inf], dtype="int64")
+ n_domain = k_domain = Domain([0, 1, 2, 3, np.inf], dtype="int64")
check_logp(
pm.HyperGeometric,
Nat,
- {"N": NatSmall, "k": NatSmall, "n": NatSmall},
- modified_scipy_hypergeom_logpmf,
+ {"N": N_domain, "k": k_domain, "n": n_domain},
+ lambda value, N, k, n: st.hypergeom.logpmf(value, N, k, n),
)
check_logcdf(
pm.HyperGeometric,
Nat,
- {"N": NatSmall, "k": NatSmall, "n": NatSmall},
+ {"N": N_domain, "k": k_domain, "n": n_domain},
modified_scipy_hypergeom_logcdf,
)
check_selfconsistency_discrete_logcdf(
pm.HyperGeometric,
Nat,
- {"N": NatSmall, "k": NatSmall, "n": NatSmall},
+ {"N": N_domain, "k": k_domain, "n": n_domain},
)
@pytest.mark.xfail(
@@ -499,7 +509,7 @@ def test_categorical_bounds(self):
# entries if there is a single or pair number of negative values
# and the rest are zero
np.array([-1, -1, 0, 0]),
- at.as_tensor_variable([-1, -1, 0, 0]),
+ pt.as_tensor_variable([-1, -1, 0, 0]),
],
)
def test_categorical_negative_p(self, p):
@@ -518,7 +528,7 @@ def test_categorical_p_not_normalized(self):
def test_categorical_negative_p_symbolic(self):
value = np.array([[1, 1, 1]])
- x = at.scalar("x")
+ x = pt.scalar("x")
invalid_dist = pm.Categorical.dist(p=[x, x, x])
with pytest.raises(ParameterValueError):
@@ -527,7 +537,7 @@ def test_categorical_negative_p_symbolic(self):
def test_categorical_p_not_normalized_symbolic(self):
value = np.array([[1, 1, 1]])
- x = at.scalar("x")
+ x = pt.scalar("x")
invalid_dist = pm.Categorical.dist(p=(x, x, x))
with pytest.raises(ParameterValueError):
@@ -535,15 +545,17 @@ def test_categorical_p_not_normalized_symbolic(self):
@pytest.mark.parametrize("n", [2, 3, 4])
def test_orderedlogistic(self, n):
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", "invalid value encountered in log", RuntimeWarning)
- warnings.filterwarnings("ignore", "divide by zero encountered in log", RuntimeWarning)
- check_logp(
- pm.OrderedLogistic,
- Domain(range(n), dtype="int64", edges=(None, None)),
- {"eta": R, "cutpoints": Vector(R, n - 1)},
- lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints),
- )
+ cutpoints_domain = Vector(R, n - 1)
+ # Filter out invalid non-monotonic values
+ cutpoints_domain.vals = [v for v in cutpoints_domain.vals if np.all(np.diff(v) > 0)]
+ assert len(cutpoints_domain.vals) > 0
+
+ check_logp(
+ pm.OrderedLogistic,
+ Domain(range(n), dtype="int64", edges=(None, None)),
+ {"eta": R, "cutpoints": cutpoints_domain},
+ lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints),
+ )
@pytest.mark.parametrize("n", [2, 3, 4])
def test_orderedprobit(self, n):
@@ -1149,29 +1161,3 @@ def test_shape_inputs(self, eta, cutpoints, sigma, expected):
)
p = categorical.owner.inputs[3].eval()
assert p.shape == expected
-
-
-class TestICDF:
- @pytest.mark.parametrize(
- "dist_params, obs, size",
- [
- ((0.1,), np.array([-0.5, 0, 0.1, 0.5, 0.9, 1.0, 1.5], dtype=np.int64), ()),
- ((0.5,), np.array([-0.5, 0, 0.1, 0.5, 0.9, 1.0, 1.5], dtype=np.int64), (3, 2)),
- (
- (np.array([0.0, 0.2, 0.5, 1.0]),),
- np.array([0.7, 0.7, 0.7, 0.7], dtype=np.int64),
- (),
- ),
- ],
- )
- def test_geometric_icdf(self, dist_params, obs, size):
- dist_params_at, obs_at, size_at = create_pytensor_params(dist_params, obs, size)
- dist_params = dict(zip(dist_params_at, dist_params))
-
- x = Geometric.dist(*dist_params_at, size=size_at)
-
- def scipy_geom_icdf(value, p):
- # Scipy ppf returns floats
- return st.geom.ppf(value, p).astype(value.dtype)
-
- scipy_logprob_tester(x, obs, dist_params, test_fn=scipy_geom_icdf, test="icdf")
diff --git a/tests/distributions/test_dist_math.py b/tests/distributions/test_dist_math.py
index 0693ccf2e9..f0b6d8f87f 100644
--- a/tests/distributions/test_dist_math.py
+++ b/tests/distributions/test_dist_math.py
@@ -14,7 +14,7 @@
import numpy as np
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.special
@@ -46,16 +46,16 @@
[
([], True),
([True], True),
- ([at.ones(10)], True),
- ([at.ones(10), 5 * at.ones(101)], True),
- ([np.ones(10), 5 * at.ones(101)], True),
- ([np.ones(10), True, 5 * at.ones(101)], True),
- ([np.array([1, 2, 3]), True, 5 * at.ones(101)], True),
+ ([pt.ones(10)], True),
+ ([pt.ones(10), 5 * pt.ones(101)], True),
+ ([np.ones(10), 5 * pt.ones(101)], True),
+ ([np.ones(10), True, 5 * pt.ones(101)], True),
+ ([np.array([1, 2, 3]), True, 5 * pt.ones(101)], True),
([False], False),
- ([at.zeros(10)], False),
+ ([pt.zeros(10)], False),
([True, False], False),
- ([np.array([0, -1]), at.ones(60)], False),
- ([np.ones(10), False, 5 * at.ones(101)], False),
+ ([np.array([0, -1]), pt.ones(60)], False),
+ ([np.ones(10), False, 5 * pt.ones(101)], False),
],
)
def test_check_parameters(conditions, succeeds):
@@ -68,7 +68,7 @@ def test_check_parameters(conditions, succeeds):
def test_check_parameters_shape():
- conditions = [True, at.ones(10), at.ones(5)]
+ conditions = [True, pt.ones(10), pt.ones(5)]
assert check_parameters(1, *conditions).eval().shape == ()
@@ -81,11 +81,11 @@ def dist(cls, n, p, *args, **kwargs):
def logp(value, n, p):
return check_parameters(
- factln(n) - factln(value).sum() + (value * at.log(p)).sum(),
+ factln(n) - factln(value).sum() + (value * pt.log(p)).sum(),
value >= 0,
0 <= p,
p <= 1,
- at.isclose(p.sum(), 1),
+ pt.isclose(p.sum(), 1),
)
@@ -98,11 +98,11 @@ def dist(cls, n, p, *args, **kwargs):
def logp(value, n, p):
return check_parameters(
- factln(n) - factln(value).sum() + (value * at.log(p)).sum(),
- at.all(value >= 0),
- at.all(0 <= p),
- at.all(p <= 1),
- at.isclose(p.sum(), 1),
+ factln(n) - factln(value).sum() + (value * pt.log(p)).sum(),
+ pt.all(value >= 0),
+ pt.all(0 <= p),
+ pt.all(p <= 1),
+ pt.isclose(p.sum(), 1),
)
@@ -129,10 +129,10 @@ def test_logp(self):
chol_val = floatX(np.array([[1, 0.9], [0, 2]]))
cov_val = floatX(np.dot(chol_val, chol_val.T))
- cov = at.matrix("cov")
+ cov = pt.matrix("cov")
cov.tag.test_value = cov_val
delta_val = floatX(np.random.randn(5, 2))
- delta = at.matrix("delta")
+ delta = pt.matrix("delta")
delta.tag.test_value = delta_val
expect = stats.multivariate_normal(mean=np.zeros(2), cov=cov_val)
expect = expect.logpdf(delta_val).sum()
@@ -146,13 +146,13 @@ def test_grad(self):
np.random.seed(42)
def func(chol_vec, delta):
- chol = at.stack(
+ chol = pt.stack(
[
- at.stack([at.exp(0.1 * chol_vec[0]), 0]),
- at.stack([chol_vec[1], 2 * at.exp(chol_vec[2])]),
+ pt.stack([pt.exp(0.1 * chol_vec[0]), 0]),
+ pt.stack([chol_vec[1], 2 * pt.exp(chol_vec[2])]),
]
)
- cov = at.dot(chol, chol.T)
+ cov = pt.dot(chol, chol.T)
return MvNormalLogp()(cov, delta)
chol_vec_val = floatX(np.array([0.5, 1.0, -0.1]))
@@ -165,21 +165,21 @@ def func(chol_vec, delta):
@pytensor.config.change_flags(compute_test_value="ignore")
def test_hessian(self):
- chol_vec = at.vector("chol_vec")
+ chol_vec = pt.vector("chol_vec")
chol_vec.tag.test_value = floatX(np.array([0.1, 2, 3]))
- chol = at.stack(
+ chol = pt.stack(
[
- at.stack([at.exp(0.1 * chol_vec[0]), 0]),
- at.stack([chol_vec[1], 2 * at.exp(chol_vec[2])]),
+ pt.stack([pt.exp(0.1 * chol_vec[0]), 0]),
+ pt.stack([chol_vec[1], 2 * pt.exp(chol_vec[2])]),
]
)
- cov = at.dot(chol, chol.T)
- delta = at.matrix("delta")
+ cov = pt.dot(chol, chol.T)
+ delta = pt.matrix("delta")
delta.tag.test_value = floatX(np.ones((5, 2)))
logp = MvNormalLogp()(cov, delta)
- g_cov, g_delta = at.grad(logp, [cov, delta])
+ g_cov, g_delta = pt.grad(logp, [cov, delta])
# TODO: What's the test? Something needs to be asserted.
- at.grad(g_delta.sum() + g_cov.sum(), [delta, cov])
+ pt.grad(g_delta.sum() + g_cov.sum(), [delta, cov])
class TestSplineWrapper:
@@ -195,10 +195,10 @@ def test_hessian(self):
x = np.linspace(0, 1, 100)
y = x * x
spline = SplineWrapper(interpolate.InterpolatedUnivariateSpline(x, y, k=1))
- x_var = at.dscalar("x")
- (g_x,) = at.grad(spline(x_var), [x_var])
+ x_var = pt.dscalar("x")
+ (g_x,) = pt.grad(spline(x_var), [x_var])
with pytest.raises(NotImplementedError):
- at.grad(g_x, [x_var])
+ pt.grad(g_x, [x_var])
class TestI0e:
@@ -224,8 +224,8 @@ def check_vals(fn1, fn2, *args):
def test_multigamma():
- x = at.vector("x")
- p = at.scalar("p")
+ x = pt.vector("x")
+ p = pt.scalar("p")
xvals = [np.array([v], dtype=config.floatX) for v in [0.1, 2, 5, 10, 50, 100]]
@@ -243,4 +243,4 @@ def ref_multigammaln(a, b):
def test_incomplete_beta_deprecation():
with pytest.warns(FutureWarning, match="incomplete_beta has been deprecated"):
res = incomplete_beta(3, 5, 0.5).eval()
- assert np.isclose(res, at.betainc(3, 5, 0.5).eval())
+ assert np.isclose(res, pt.betainc(3, 5, 0.5).eval())
diff --git a/tests/distributions/test_distribution.py b/tests/distributions/test_distribution.py
index 7251676a2f..753d4d6487 100644
--- a/tests/distributions/test_distribution.py
+++ b/tests/distributions/test_distribution.py
@@ -13,11 +13,12 @@
# limitations under the License.
import warnings
+import cloudpickle
import numpy as np
import numpy.random as npr
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats as st
@@ -46,12 +47,12 @@
from pymc.distributions.shape_utils import change_dist_size, rv_size_is_none, to_tuple
from pymc.distributions.transforms import log
from pymc.exceptions import BlockModelAccessError
-from pymc.logprob.abstract import get_measurable_outputs, logcdf
-from pymc.logprob.joint_logprob import logp
-from pymc.model import Model
+from pymc.logprob.abstract import get_measurable_outputs
+from pymc.logprob.basic import logcdf, logp
+from pymc.model import Deterministic, Model
from pymc.sampling import draw, sample
+from pymc.testing import assert_moment_is_expected
from pymc.util import _FutureWarningValidatingScratchpad
-from tests.distributions.util import assert_moment_is_expected
class TestBugfixes:
@@ -235,14 +236,16 @@ def random(rng, size):
with Model():
Normal("x")
y = CustomDist("y", logp=func, random=random)
+ y_dist = CustomDist.dist(logp=func, random=random)
+ Deterministic("y_dist", y_dist)
assert isinstance(y.owner.op, CustomDistRV)
+ assert isinstance(y_dist.owner.op, CustomDistRV)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", ".*number of samples.*", UserWarning)
sample(draws=5, tune=1, mp_ctx="spawn")
- import cloudpickle
-
cloudpickle.loads(cloudpickle.dumps(y))
+ cloudpickle.loads(cloudpickle.dumps(y_dist))
def test_custom_dist_old_api_error(self):
with Model():
@@ -257,7 +260,7 @@ def test_custom_dist_multivariate_logp(self, size):
with Model() as model:
def logp(value, mu):
- return pm.MvNormal.logp(value, mu, at.eye(mu.shape[0]))
+ return pm.MvNormal.logp(value, mu, pt.eye(mu.shape[0]))
mu = Normal("mu", size=supp_shape)
a = CustomDist("a", mu, logp=logp, ndims_params=[1], ndim_supp=1, size=size)
@@ -281,7 +284,7 @@ def logp(value, mu):
)
def test_custom_dist_default_moment_univariate(self, moment, size, expected):
if moment == "custom_moment":
- moment = lambda rv, size, *rv_inputs: 5 * at.ones(size, dtype=rv.dtype)
+ moment = lambda rv, size, *rv_inputs: 5 * pt.ones(size, dtype=rv.dtype)
with pm.Model() as model:
x = CustomDist("x", moment=moment, size=size)
assert isinstance(x.owner.op, CustomDistRV)
@@ -290,7 +293,7 @@ def test_custom_dist_default_moment_univariate(self, moment, size, expected):
@pytest.mark.parametrize("size", [(), (2,), (3, 2)], ids=str)
def test_custom_dist_custom_moment_univariate(self, size):
def density_moment(rv, size, mu):
- return (at.ones(size) * mu).astype(rv.dtype)
+ return (pt.ones(size) * mu).astype(rv.dtype)
mu_val = np.array(np.random.normal(loc=2, scale=1)).astype(pytensor.config.floatX)
with Model():
@@ -304,7 +307,7 @@ def density_moment(rv, size, mu):
@pytest.mark.parametrize("size", [(), (2,), (3, 2)], ids=str)
def test_custom_dist_custom_moment_multivariate(self, size):
def density_moment(rv, size, mu):
- return (at.ones(size)[..., None] * mu).astype(rv.dtype)
+ return (pt.ones(size)[..., None] * mu).astype(rv.dtype)
mu_val = np.random.normal(loc=2, scale=1, size=5).astype(pytensor.config.floatX)
with Model():
@@ -354,12 +357,13 @@ def test_dist(self):
mu = 1
x = pm.CustomDist.dist(
mu,
- class_name="test",
logp=lambda value, mu: pm.logp(pm.Normal.dist(mu), value),
random=lambda mu, rng=None, size=None: rng.normal(loc=mu, scale=1, size=size),
shape=(3,),
)
+ x = cloudpickle.loads(cloudpickle.dumps(x))
+
test_value = pm.draw(x, random_seed=1)
assert np.all(test_value == pm.draw(x, random_seed=1))
@@ -370,21 +374,20 @@ def test_dist(self):
class TestCustomSymbolicDist:
def test_basic(self):
def custom_dist(mu, sigma, size):
- return at.exp(pm.Normal.dist(mu, sigma, size=size))
+ return pt.exp(pm.Normal.dist(mu, sigma, size=size))
with Model() as m:
mu = Normal("mu")
sigma = HalfNormal("sigma")
- with pytest.warns(UserWarning, match="experimental"):
- lognormal = CustomDist(
- "lognormal",
- mu,
- sigma,
- dist=custom_dist,
- size=(10,),
- transform=log,
- initval=np.ones(10),
- )
+ lognormal = CustomDist(
+ "lognormal",
+ mu,
+ sigma,
+ dist=custom_dist,
+ size=(10,),
+ transform=log,
+ initval=np.ones(10),
+ )
assert isinstance(lognormal.owner.op, CustomSymbolicDistRV)
@@ -401,20 +404,34 @@ def custom_dist(mu, sigma, size):
ip = m.initial_point()
np.testing.assert_allclose(m.compile_logp()(ip), ref_m.compile_logp()(ip))
+ def test_logcdf_inference(self):
+ def custom_dist(mu, sigma, size):
+ return pt.exp(pm.Normal.dist(mu, sigma, size=size))
+
+ mu = 1
+ sigma = 1.25
+ test_value = 0.9
+
+ custom_lognormal = CustomDist.dist(mu, sigma, dist=custom_dist)
+ ref_lognormal = LogNormal.dist(mu, sigma)
+
+ np.testing.assert_allclose(
+ pm.logcdf(custom_lognormal, test_value).eval(),
+ pm.logcdf(ref_lognormal, test_value).eval(),
+ )
+
def test_random_multiple_rngs(self):
def custom_dist(p, sigma, size):
idx = pm.Bernoulli.dist(p=p)
comps = pm.Normal.dist([-sigma, sigma], 1e-1, size=(*size, 2)).T
return comps[idx]
- with pytest.warns(UserWarning, match="experimental"):
- customdist = CustomDist.dist(
- 0.5,
- 10.0,
- class_name="customdist",
- dist=custom_dist,
- size=(10,),
- )
+ customdist = CustomDist.dist(
+ 0.5,
+ 10.0,
+ dist=custom_dist,
+ size=(10,),
+ )
assert isinstance(customdist.owner.op, CustomSymbolicDistRV)
@@ -430,26 +447,24 @@ def test_custom_methods(self):
def custom_dist(mu, size):
if rv_size_is_none(size):
return mu
- return at.full(size, mu)
+ return pt.full(size, mu)
def custom_moment(rv, size, mu):
- return at.full_like(rv, mu + 1)
+ return pt.full_like(rv, mu + 1)
def custom_logp(value, mu):
- return at.full_like(value, mu + 2)
+ return pt.full_like(value, mu + 2)
def custom_logcdf(value, mu):
- return at.full_like(value, mu + 3)
-
- with pytest.warns(UserWarning, match="experimental"):
- customdist = CustomDist.dist(
- [np.e, np.e],
- class_name="customdist",
- dist=custom_dist,
- moment=custom_moment,
- logp=custom_logp,
- logcdf=custom_logcdf,
- )
+ return pt.full_like(value, mu + 3)
+
+ customdist = CustomDist.dist(
+ [np.e, np.e],
+ dist=custom_dist,
+ moment=custom_moment,
+ logp=custom_logp,
+ logcdf=custom_logcdf,
+ )
assert isinstance(customdist.owner.op, CustomSymbolicDistRV)
@@ -460,16 +475,14 @@ def custom_logcdf(value, mu):
def test_change_size(self):
def custom_dist(mu, sigma, size):
- return at.exp(pm.Normal.dist(mu, sigma, size=size))
+ return pt.exp(pm.Normal.dist(mu, sigma, size=size))
- with pytest.warns(UserWarning, match="experimental"):
- lognormal = CustomDist.dist(
- 0,
- 1,
- class_name="lognormal",
- dist=custom_dist,
- size=(10,),
- )
+ lognormal = CustomDist.dist(
+ 0,
+ 1,
+ dist=custom_dist,
+ size=(10,),
+ )
assert isinstance(lognormal.owner.op, CustomSymbolicDistRV)
assert tuple(lognormal.shape.eval()) == (10,)
diff --git a/tests/distributions/test_mixture.py b/tests/distributions/test_mixture.py
index dc6a4c7446..df41f8b071 100644
--- a/tests/distributions/test_mixture.py
+++ b/tests/distributions/test_mixture.py
@@ -22,7 +22,7 @@
import scipy.stats as st
from numpy.testing import assert_allclose
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.tensor import TensorVariable
from pytensor.tensor.random.op import RandomVariable
from scipy.special import logsumexp
@@ -50,7 +50,7 @@
from pymc.distributions.mixture import MixtureTransformWarning
from pymc.distributions.shape_utils import change_dist_size, to_tuple
from pymc.distributions.transforms import _default_transform
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.logprob.transforms import IntervalTransform, LogTransform, SimplexTransform
from pymc.math import expand_packed_triangular
from pymc.model import Model
@@ -62,13 +62,13 @@
)
from pymc.sampling.mcmc import sample
from pymc.step_methods import Metropolis
-from tests.distributions.util import (
+from pymc.testing import (
Domain,
+ SeededTest,
Simplex,
assert_moment_is_expected,
- pymc_random,
+ continuous_random_tester,
)
-from tests.helpers import SeededTest
def generate_normal_mixture_data(w, mu, sigma, size=1000):
@@ -850,7 +850,7 @@ def ref_rand(size, w, mu, sigma):
component = np.random.choice(w.size, size=size, p=w)
return np.random.normal(mu[component], sigma[component], size=size)
- pymc_random(
+ continuous_random_tester(
NormalMixture,
{
"w": Simplex(2),
@@ -861,7 +861,7 @@ def ref_rand(size, w, mu, sigma):
size=1000,
ref_rand=ref_rand,
)
- pymc_random(
+ continuous_random_tester(
NormalMixture,
{
"w": Simplex(3),
@@ -881,7 +881,7 @@ def test_scalar_components(self):
nd = 3
npop = 4
# [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]
- mus = at.constant(np.full((nd, npop), np.arange(npop)))
+ mus = pt.constant(np.full((nd, npop), np.arange(npop)))
with Model() as model:
m = NormalMixture(
@@ -893,7 +893,7 @@ def test_scalar_components(self):
shape=nd,
)
z = Categorical("z", p=np.ones(npop) / npop, shape=nd)
- mu = at.as_tensor_variable([mus[i, z[i]] for i in range(nd)])
+ mu = pt.as_tensor_variable([mus[i, z[i]] for i in range(nd)])
latent_m = Normal("latent_m", mu=mu, sigma=1e-5, shape=nd)
size = 100
@@ -916,7 +916,7 @@ def test_vector_components(self):
nd = 3
npop = 4
# [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]
- mus = at.constant(np.full((nd, npop), np.arange(npop)))
+ mus = pt.constant(np.full((nd, npop), np.arange(npop)))
with Model() as model:
m = Mixture(
@@ -1299,13 +1299,13 @@ def test_hierarchical_interval_transform(self):
with Model() as model:
lower = Normal("lower", 0.5)
upper = Uniform("upper", 0, 1)
- uniform = Uniform("uniform", -at.abs(lower), at.abs(upper), transform=None)
+ uniform = Uniform("uniform", -pt.abs(lower), pt.abs(upper), transform=None)
triangular = Triangular(
- "triangular", -at.abs(lower), at.abs(upper), c=0.25, transform=None
+ "triangular", -pt.abs(lower), pt.abs(upper), c=0.25, transform=None
)
comp_dists = [
- Uniform.dist(-at.abs(lower), at.abs(upper)),
- Triangular.dist(-at.abs(lower), at.abs(upper), c=0.25),
+ Uniform.dist(-pt.abs(lower), pt.abs(upper)),
+ Triangular.dist(-pt.abs(lower), pt.abs(upper), c=0.25),
]
mix1 = Mixture("mix1", [0.3, 0.7], comp_dists)
mix2 = Mixture("mix2", [0.3, 0.7][::-1], comp_dists[::-1])
diff --git a/tests/distributions/test_multivariate.py b/tests/distributions/test_multivariate.py
index 5e5385a145..671fe4fa79 100644
--- a/tests/distributions/test_multivariate.py
+++ b/tests/distributions/test_multivariate.py
@@ -20,7 +20,7 @@
import numpy.random as npr
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.special as sp
import scipy.stats as st
@@ -37,12 +37,12 @@
quaddist_matrix,
)
from pymc.distributions.shape_utils import change_dist_size, to_tuple
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.logprob.utils import ParameterValueError
from pymc.math import kronecker
from pymc.pytensorf import compile_pymc, floatX, intX
from pymc.sampling.forward import draw
-from tests.distributions.util import (
+from pymc.testing import (
BaseTestDistributionRandom,
Domain,
Nat,
@@ -54,10 +54,10 @@
Vector,
assert_moment_is_expected,
check_logp,
- pymc_random,
+ continuous_random_tester,
seeded_numpy_distribution_builder,
+ select_by_precision,
)
-from tests.helpers import select_by_precision
def betafn(a):
@@ -159,9 +159,9 @@ def mvt_logpdf(value, nu, Sigma, mu=0):
@pytest.fixture(scope="module")
def stickbreakingweights_logpdf():
- _value = at.vector()
- _alpha = at.scalar()
- _k = at.iscalar()
+ _value = pt.vector()
+ _alpha = pt.scalar()
+ _k = pt.iscalar()
_logp = logp(pm.StickBreakingWeights.dist(_alpha, _k), _value)
core_fn = compile_pymc([_value, _alpha, _k], _logp)
@@ -297,16 +297,16 @@ def test_mvnormal(self, n):
)
def test_mvnormal_indef(self):
cov_val = np.array([[1, 0.5], [0.5, -2]])
- cov = at.matrix("cov")
+ cov = pt.matrix("cov")
cov.tag.test_value = np.eye(2)
mu = floatX(np.zeros(2))
- x = at.vector("x")
+ x = pt.vector("x")
x.tag.test_value = np.zeros(2)
mvn_logp = logp(pm.MvNormal.dist(mu=mu, cov=cov), x)
f_logp = pytensor.function([cov, x], mvn_logp)
with pytest.raises(ParameterValueError):
f_logp(cov_val, np.ones(2))
- dlogp = at.grad(mvn_logp, cov)
+ dlogp = pt.grad(mvn_logp, cov)
f_dlogp = pytensor.function([cov, x], dlogp)
assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2))))
@@ -314,7 +314,7 @@ def test_mvnormal_indef(self):
f_logp = pytensor.function([cov, x], mvn_logp)
with pytest.raises(ParameterValueError):
f_logp(cov_val, np.ones(2))
- dlogp = at.grad(mvn_logp, cov)
+ dlogp = pt.grad(mvn_logp, cov)
f_dlogp = pytensor.function([cov, x], dlogp)
assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2))))
@@ -482,9 +482,11 @@ def test_lkjcorr(self, x, eta, n, lp):
with pm.Model() as model:
pm.LKJCorr("lkj", eta=eta, n=n, transform=None)
- pt = {"lkj": x}
+ point = {"lkj": x}
decimals = select_by_precision(float64=6, float32=4)
- npt.assert_almost_equal(model.compile_logp()(pt), lp, decimal=decimals, err_msg=str(pt))
+ npt.assert_almost_equal(
+ model.compile_logp()(point), lp, decimal=decimals, err_msg=str(point)
+ )
@pytest.mark.parametrize("n", [1, 2, 3])
def test_dirichlet(self, n):
@@ -566,7 +568,7 @@ def test_multinomial_negative_p_symbolic(self):
# logp raises a ParameterValueError
value = np.array([[1, 1, 1]])
- x = at.scalar("x")
+ x = pt.scalar("x")
invalid_dist = pm.Multinomial.dist(n=1, p=[x, x, x])
with pytest.raises(ParameterValueError):
@@ -577,7 +579,7 @@ def test_multinomial_p_not_normalized_symbolic(self):
# logp raises a ParameterValueError
value = np.array([[1, 1, 1]])
- x = at.scalar("x")
+ x = pt.scalar("x")
invalid_dist = pm.Multinomial.dist(n=1, p=(x, x, x))
with pytest.raises(ParameterValueError):
pm.logp(invalid_dist, value).eval({x: 0.5})
@@ -710,12 +712,12 @@ def test_dirichlet_multinomial_vectorized(self, n, a, extra_size):
def test_stickbreakingweights_logp(self, value, alpha, K, logp):
with pm.Model() as model:
sbw = pm.StickBreakingWeights("sbw", alpha=alpha, K=K, transform=None)
- pt = {"sbw": value}
+ point = {"sbw": value}
npt.assert_almost_equal(
pm.logp(sbw, value).eval(),
logp,
decimal=select_by_precision(float64=6, float32=2),
- err_msg=str(pt),
+ err_msg=str(point),
)
def test_stickbreakingweights_invalid(self):
@@ -737,12 +739,12 @@ def test_stickbreakingweights_vectorized(self, alpha, K, stickbreakingweights_lo
value = pm.StickBreakingWeights.dist(alpha, K).eval()
with pm.Model():
sbw = pm.StickBreakingWeights("sbw", alpha=alpha, K=K, transform=None)
- pt = {"sbw": value}
+ point = {"sbw": value}
npt.assert_almost_equal(
pm.logp(sbw, value).eval(),
stickbreakingweights_logpdf(value, alpha, K),
decimal=select_by_precision(float64=6, float32=2),
- err_msg=str(pt),
+ err_msg=str(point),
)
@pytest.mark.parametrize(
@@ -842,7 +844,7 @@ def test_dist(self):
def test_sd_dist_distribution(self):
with pm.Model() as m:
- sd_dist = at.constant([1, 2, 3])
+ sd_dist = pt.constant([1, 2, 3])
with pytest.raises(TypeError, match="^sd_dist must be a scalar or vector distribution"):
x = pm.LKJCholeskyCov("x", n=3, eta=1, sd_dist=sd_dist)
@@ -2010,7 +2012,7 @@ def ref_rand(size, n, eta):
beta = eta - 1 + n / 2
return (st.beta.rvs(size=(size, shape), a=beta, b=beta) - 0.5) * 2
- pymc_random(
+ continuous_random_tester(
pm.LKJCorr,
{
"n": Domain([2, 10, 50], edges=(None, None)),
diff --git a/tests/distributions/test_shape_utils.py b/tests/distributions/test_shape_utils.py
index 6458ccb420..2c2598145b 100644
--- a/tests/distributions/test_shape_utils.py
+++ b/tests/distributions/test_shape_utils.py
@@ -18,7 +18,7 @@
import pytensor
import pytest
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.compile.mode import Mode
from pytensor.graph import Constant, ancestors
from pytensor.tensor import TensorVariable
@@ -30,17 +30,13 @@
from pymc import ShapeError
from pymc.distributions.shape_utils import (
broadcast_dist_samples_shape,
- broadcast_dist_samples_to,
- broadcast_distribution_samples,
change_dist_size,
convert_dims,
convert_shape,
convert_size,
- get_broadcastable_dist_samples,
get_support_shape,
get_support_shape_1d,
rv_size_is_none,
- shapes_broadcasting,
to_tuple,
)
from pymc.model import Model
@@ -90,67 +86,18 @@ def fixture_exception_handling(request):
return request.param
-@pytest.fixture()
-def samples_to_broadcast(fixture_sizes, fixture_shapes):
- samples = [np.empty(s) for s in fixture_shapes]
- try:
- broadcast_shape = broadcast_dist_samples_shape(fixture_shapes, size=fixture_sizes)
- except ValueError:
- broadcast_shape = None
- return fixture_sizes, samples, broadcast_shape
-
-
-@pytest.fixture(params=test_to_shapes, ids=str)
-def samples_to_broadcast_to(request, samples_to_broadcast):
- to_shape = request.param
- size, samples, broadcast_shape = samples_to_broadcast
- if broadcast_shape is not None:
- try:
- broadcast_shape = broadcast_dist_samples_shape(
- [broadcast_shape, to_tuple(to_shape)], size=size
- )
- except ValueError:
- broadcast_shape = None
- return to_shape, size, samples, broadcast_shape
-
-
class TestShapesBroadcasting:
- @pytest.mark.parametrize(
- "bad_input",
- [None, [None], "asd", 3.6, {1: 2}, {3}, [8, [8]], "3", ["3"], np.array([[2]])],
- ids=str,
- )
- def test_type_check_raises(self, bad_input):
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore", ".*ragged nested sequences.*", np.VisibleDeprecationWarning
- )
- with pytest.raises(TypeError):
- shapes_broadcasting(bad_input, tuple(), raise_exception=True)
- with pytest.raises(TypeError):
- shapes_broadcasting(bad_input, tuple(), raise_exception=False)
-
- def test_type_check_success(self):
- inputs = [3, 3.0, tuple(), [3], (3,), np.array(3), np.array([3])]
- out = shapes_broadcasting(*inputs)
- assert out == (3,)
-
- def test_broadcasting(self, fixture_shapes, fixture_exception_handling):
+ def test_broadcasting(self, fixture_shapes):
shapes = fixture_shapes
- raise_exception = fixture_exception_handling
try:
expected_out = np.broadcast(*(np.empty(s) for s in shapes)).shape
except ValueError:
expected_out = None
if expected_out is None:
- if raise_exception:
- with pytest.raises(ValueError):
- shapes_broadcasting(*shapes, raise_exception=raise_exception)
- else:
- out = shapes_broadcasting(*shapes, raise_exception=raise_exception)
- assert out is None
+ with pytest.raises(ValueError):
+ np.broadcast_shapes(*shapes)
else:
- out = shapes_broadcasting(*shapes, raise_exception=raise_exception)
+ out = np.broadcast_shapes(*shapes)
assert out == expected_out
def test_broadcast_dist_samples_shape(self, fixture_sizes, fixture_shapes):
@@ -176,48 +123,6 @@ def test_broadcast_dist_samples_shape(self, fixture_sizes, fixture_shapes):
assert out == expected_out
-class TestSamplesBroadcasting:
- def test_broadcast_distribution_samples(self, samples_to_broadcast):
- size, samples, broadcast_shape = samples_to_broadcast
- if broadcast_shape is not None:
- outs = broadcast_distribution_samples(samples, size=size)
- assert all(o.shape == broadcast_shape for o in outs)
- else:
- with pytest.raises(ValueError):
- broadcast_distribution_samples(samples, size=size)
-
- def test_get_broadcastable_dist_samples(self, samples_to_broadcast):
- size, samples, broadcast_shape = samples_to_broadcast
- if broadcast_shape is not None:
- size_ = to_tuple(size)
- outs, out_shape = get_broadcastable_dist_samples(
- samples, size=size, return_out_shape=True
- )
- assert out_shape == broadcast_shape
- for i, o in zip(samples, outs):
- ishape = i.shape
- if ishape[: min([len(size_), len(ishape)])] == size_:
- expected_shape = (
- size_ + (1,) * (len(broadcast_shape) - len(ishape)) + ishape[len(size_) :]
- )
- else:
- expected_shape = ishape
- assert o.shape == expected_shape
- assert shapes_broadcasting(*(o.shape for o in outs)) == broadcast_shape
- else:
- with pytest.raises(ValueError):
- get_broadcastable_dist_samples(samples, size=size)
-
- def test_broadcast_dist_samples_to(self, samples_to_broadcast_to):
- to_shape, size, samples, broadcast_shape = samples_to_broadcast_to
- if broadcast_shape is not None:
- outs = broadcast_dist_samples_to(to_shape, samples, size=size)
- assert all(o.shape == broadcast_shape for o in outs)
- else:
- with pytest.raises(ValueError):
- broadcast_dist_samples_to(to_shape, samples, size=size)
-
-
class TestSizeShapeDimsObserved:
@pytest.mark.parametrize("param_shape", [(), (2,)])
@pytest.mark.parametrize("batch_shape", [(), (3,)])
@@ -299,7 +204,7 @@ def test_simultaneous_size_and_dims(self):
assert "ddata" in pmodel.dim_lengths
# Size does not include support dims, so this test must use a dist with support dims.
- kwargs = dict(name="y", size=(2, 3), mu=at.ones((3, 4)), cov=at.eye(4))
+ kwargs = dict(name="y", size=(2, 3), mu=pt.ones((3, 4)), cov=pt.eye(4))
y = pm.MvNormal(**kwargs, dims=("dsize", "ddata", "dsupport"))
assert pmodel.named_vars_to_dims["y"] == ("dsize", "ddata", "dsupport")
@@ -343,7 +248,7 @@ def test_can_resize_data_defined_size(self):
assert z.eval().shape == (3, 2)
def test_size32_doesnt_break_broadcasting(self):
- size32 = at.constant([1, 10], dtype="int32")
+ size32 = pt.constant([1, 10], dtype="int32")
rv = pm.Normal.dist(0, 1, size=size32)
assert rv.broadcastable == (True, False)
@@ -355,15 +260,15 @@ def test_observed_with_column_vector(self):
with pm.Model() as model:
# The `observed` is a broadcastable column vector
obs = [
- at.as_tensor_variable(np.ones((3, 1), dtype=pytensor.config.floatX))
+ pt.as_tensor_variable(np.ones((3, 1), dtype=pytensor.config.floatX))
for _ in range(4)
]
assert all(obs_.broadcastable == (False, True) for obs_ in obs)
# Both shapes describe broadcastable column vectors
- size64 = at.constant([3, 1], dtype="int64")
+ size64 = pt.constant([3, 1], dtype="int64")
# But the second shape is upcasted from an int32 vector
- cast64 = at.cast(at.constant([3, 1], dtype="int32"), dtype="int64")
+ cast64 = pt.cast(pt.constant([3, 1], dtype="int32"), dtype="int64")
pm.Normal("size64", mu=0, sigma=1, size=size64, observed=obs[0])
pm.Normal("shape64", mu=0, sigma=1, shape=size64, observed=obs[1])
@@ -494,7 +399,7 @@ def test_rv_size_is_none():
def test_change_rv_size():
- loc = at.as_tensor_variable([1, 2])
+ loc = pt.as_tensor_variable([1, 2])
rng = pytensor.shared(np.random.default_rng())
rv = normal(loc=loc, rng=rng)
assert rv.ndim == 1
@@ -503,7 +408,7 @@ def test_change_rv_size():
with pytest.raises(ShapeError, match="must be ≤1-dimensional"):
change_dist_size(rv, new_size=[[2, 3]])
with pytest.raises(ShapeError, match="must be ≤1-dimensional"):
- change_dist_size(rv, new_size=at.as_tensor_variable([[2, 3], [4, 5]]))
+ change_dist_size(rv, new_size=pt.as_tensor_variable([[2, 3], [4, 5]]))
rv_new = change_dist_size(rv, new_size=(3,), expand=True)
assert rv_new.ndim == 2
@@ -532,13 +437,13 @@ def test_change_rv_size():
assert tuple(rv_newer.shape.eval()) == (4, 3)
rv = normal(0, 1)
- new_size = at.as_tensor(np.array([4, 3], dtype="int32"))
+ new_size = pt.as_tensor(np.array([4, 3], dtype="int32"))
rv_newer = change_dist_size(rv, new_size=new_size, expand=True)
assert rv_newer.ndim == 2
assert tuple(rv_newer.shape.eval()) == (4, 3)
rv = normal(0, 1)
- new_size = at.as_tensor(2, dtype="int32")
+ new_size = pt.as_tensor(2, dtype="int32")
rv_newer = change_dist_size(rv, new_size=new_size, expand=True)
assert rv_newer.ndim == 1
assert tuple(rv_newer.shape.eval()) == (2,)
@@ -573,9 +478,9 @@ def test_change_rv_size_default_update():
def test_change_specify_shape_size_univariate():
with pytensor.config.change_flags(mode=Mode("py")):
- s1, s2 = at.iscalars("s1", "s2")
- x = at.random.normal(size=(s1, s2))
- x = at.specify_shape(x, (5, 3))
+ s1, s2 = pt.iscalars("s1", "s2")
+ x = pt.random.normal(size=(s1, s2))
+ x = pt.specify_shape(x, (5, 3))
x.eval({s1: 5, s2: 3}).shape == (5, 3)
new_x = change_dist_size(x, (10, 5))
@@ -593,9 +498,9 @@ def test_change_specify_shape_size_univariate():
def test_change_specify_shape_size_multivariate():
with pytensor.config.change_flags(mode=Mode("py")):
- batch, supp = at.iscalars("batch", "supp")
- x = at.random.multivariate_normal(at.zeros(supp), at.eye(supp), size=(batch,))
- x = at.specify_shape(x, (5, 3))
+ batch, supp = pt.iscalars("batch", "supp")
+ x = pt.random.multivariate_normal(pt.zeros(supp), pt.eye(supp), size=(batch,))
+ x = pt.specify_shape(x, (5, 3))
x.eval({batch: 5, supp: 3}).shape == (5, 3)
new_x = change_dist_size(x, (10, 5))
diff --git a/tests/distributions/test_simulator.py b/tests/distributions/test_simulator.py
index 7e06d2eb0f..0e3b86bd44 100644
--- a/tests/distributions/test_simulator.py
+++ b/tests/distributions/test_simulator.py
@@ -13,6 +13,7 @@
# limitations under the License.
import warnings
+import cloudpickle
import numpy as np
import pytensor
import pytest
@@ -32,7 +33,7 @@
from pymc.initial_point import make_initial_point_fn
from pymc.pytensorf import compile_pymc
from pymc.smc.kernels import IMH
-from tests.helpers import SeededTest
+from pymc.testing import SeededTest
class TestSimulator(SeededTest):
@@ -357,9 +358,10 @@ def normal_sim(rng, mu, sigma, size):
assert np.all(np.abs((result - expected_sample_mean) / expected_sample_mean_std) < cutoff)
def test_dist(self):
- x = pm.Simulator.dist(self.normal_sim, 0, 1, sum_stat="sort", shape=(3,), class_name="test")
- x_logp = pm.logp(x, [0, 1, 2])
+ x = pm.Simulator.dist(self.normal_sim, 0, 1, sum_stat="sort", shape=(3,))
+ x = cloudpickle.loads(cloudpickle.dumps(x))
+ x_logp = pm.logp(x, [0, 1, 2])
x_logp_fn = compile_pymc([], x_logp, random_seed=1)
res1, res2 = x_logp_fn(), x_logp_fn()
assert res1.shape == (3,)
diff --git a/tests/distributions/test_timeseries.py b/tests/distributions/test_timeseries.py
index d49268e1a7..4e19b0bc0c 100644
--- a/tests/distributions/test_timeseries.py
+++ b/tests/distributions/test_timeseries.py
@@ -39,13 +39,12 @@
MvStudentTRandomWalk,
RandomWalk,
)
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.model import Model
from pymc.pytensorf import floatX
from pymc.sampling.forward import draw, sample_posterior_predictive
from pymc.sampling.mcmc import sample
-from tests.distributions.util import assert_moment_is_expected
-from tests.helpers import select_by_precision
+from pymc.testing import assert_moment_is_expected, select_by_precision
# Turn all warnings into errors for this module
# Ignoring NumPy deprecation warning tracked in https://github.com/pymc-devs/pytensor/issues/146
diff --git a/tests/distributions/test_transform.py b/tests/distributions/test_transform.py
index 9d47a59e10..a29ab16679 100644
--- a/tests/distributions/test_transform.py
+++ b/tests/distributions/test_transform.py
@@ -17,7 +17,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor.tensor.var import TensorConstant
@@ -25,29 +25,29 @@
import pymc as pm
import pymc.distributions.transforms as tr
-from pymc.logprob.joint_logprob import joint_logp
+from pymc.logprob.basic import joint_logp
from pymc.pytensorf import floatX, jacobian
-from tests.checks import close_to, close_to_logical
-from tests.distributions.util import (
+from pymc.testing import (
Circ,
MultiSimplex,
R,
Rminusbig,
Rplusbig,
+ SeededTest,
Simplex,
SortedVector,
Unit,
UnitSortedVector,
Vector,
)
-from tests.helpers import SeededTest
+from tests.checks import close_to, close_to_logical
# some transforms (stick breaking) require addition of small slack in order to be numerically
# stable. The minimal addable slack for float32 is higher thus we need to be less strict
tol = 1e-7 if pytensor.config.floatX == "float64" else 1e-6
-def check_transform(transform, domain, constructor=at.dscalar, test=0, rv_var=None):
+def check_transform(transform, domain, constructor=pt.dscalar, test=0, rv_var=None):
x = constructor("x")
x.tag.test_value = test
if rv_var is None:
@@ -65,10 +65,10 @@ def check_transform(transform, domain, constructor=at.dscalar, test=0, rv_var=No
def check_vector_transform(transform, domain, rv_var=None):
- return check_transform(transform, domain, at.dvector, test=np.array([0, 0]), rv_var=rv_var)
+ return check_transform(transform, domain, pt.dvector, test=np.array([0, 0]), rv_var=rv_var)
-def get_values(transform, domain=R, constructor=at.dscalar, test=0, rv_var=None):
+def get_values(transform, domain=R, constructor=pt.dscalar, test=0, rv_var=None):
x = constructor("x")
x.tag.test_value = test
if rv_var is None:
@@ -81,7 +81,7 @@ def get_values(transform, domain=R, constructor=at.dscalar, test=0, rv_var=None)
def check_jacobian_det(
transform,
domain,
- constructor=at.dscalar,
+ constructor=pt.dscalar,
test=0,
make_comparable=None,
elemwise=False,
@@ -100,15 +100,15 @@ def check_jacobian_det(
x = make_comparable(x)
if not elemwise:
- jac = at.log(at.nlinalg.det(jacobian(x, [y])))
+ jac = pt.log(pt.nlinalg.det(jacobian(x, [y])))
else:
- jac = at.log(at.abs(at.diag(jacobian(x, [y]))))
+ jac = pt.log(pt.abs(pt.diag(jacobian(x, [y]))))
# ljd = log jacobian det
actual_ljd = pytensor.function([y], jac)
computed_ljd = pytensor.function(
- [y], at.as_tensor_variable(transform.log_jac_det(y, *rv_inputs)), on_unused_input="ignore"
+ [y], pt.as_tensor_variable(transform.log_jac_det(y, *rv_inputs)), on_unused_input="ignore"
)
for yval in domain.vals:
@@ -119,22 +119,22 @@ def test_simplex():
check_vector_transform(tr.simplex, Simplex(2))
check_vector_transform(tr.simplex, Simplex(4))
- check_transform(tr.simplex, MultiSimplex(3, 2), constructor=at.dmatrix, test=np.zeros((2, 2)))
+ check_transform(tr.simplex, MultiSimplex(3, 2), constructor=pt.dmatrix, test=np.zeros((2, 2)))
def test_simplex_bounds():
- vals = get_values(tr.simplex, Vector(R, 2), at.dvector, np.array([0, 0]))
+ vals = get_values(tr.simplex, Vector(R, 2), pt.dvector, np.array([0, 0]))
close_to(vals.sum(axis=1), 1, tol)
close_to_logical(vals > 0, True, tol)
close_to_logical(vals < 1, True, tol)
- check_jacobian_det(tr.simplex, Vector(R, 2), at.dvector, np.array([0, 0]), lambda x: x[:-1])
+ check_jacobian_det(tr.simplex, Vector(R, 2), pt.dvector, np.array([0, 0]), lambda x: x[:-1])
def test_simplex_accuracy():
val = np.array([-30])
- x = at.dvector("x")
+ x = pt.dvector("x")
x.tag.test_value = val
identity_f = pytensor.function([x], tr.simplex.forward(x, tr.simplex.backward(x, x)))
close_to(val, identity_f(val), tol)
@@ -148,10 +148,10 @@ def test_sum_to_1():
tr.SumTo1(2)
check_jacobian_det(
- tr.univariate_sum_to_1, Vector(Unit, 2), at.dvector, np.array([0, 0]), lambda x: x[:-1]
+ tr.univariate_sum_to_1, Vector(Unit, 2), pt.dvector, np.array([0, 0]), lambda x: x[:-1]
)
check_jacobian_det(
- tr.multivariate_sum_to_1, Vector(Unit, 2), at.dvector, np.array([0, 0]), lambda x: x[:-1]
+ tr.multivariate_sum_to_1, Vector(Unit, 2), pt.dvector, np.array([0, 0]), lambda x: x[:-1]
)
@@ -159,7 +159,7 @@ def test_log():
check_transform(tr.log, Rplusbig)
check_jacobian_det(tr.log, Rplusbig, elemwise=True)
- check_jacobian_det(tr.log, Vector(Rplusbig, 2), at.dvector, [0, 0], elemwise=True)
+ check_jacobian_det(tr.log, Vector(Rplusbig, 2), pt.dvector, [0, 0], elemwise=True)
vals = get_values(tr.log)
close_to_logical(vals > 0, True, tol)
@@ -169,7 +169,7 @@ def test_log_exp_m1():
check_transform(tr.log_exp_m1, Rplusbig)
check_jacobian_det(tr.log_exp_m1, Rplusbig, elemwise=True)
- check_jacobian_det(tr.log_exp_m1, Vector(Rplusbig, 2), at.dvector, [0, 0], elemwise=True)
+ check_jacobian_det(tr.log_exp_m1, Vector(Rplusbig, 2), pt.dvector, [0, 0], elemwise=True)
vals = get_values(tr.log_exp_m1)
close_to_logical(vals > 0, True, tol)
@@ -179,7 +179,7 @@ def test_logodds():
check_transform(tr.logodds, Unit)
check_jacobian_det(tr.logodds, Unit, elemwise=True)
- check_jacobian_det(tr.logodds, Vector(Unit, 2), at.dvector, [0.5, 0.5], elemwise=True)
+ check_jacobian_det(tr.logodds, Vector(Unit, 2), pt.dvector, [0.5, 0.5], elemwise=True)
vals = get_values(tr.logodds)
close_to_logical(vals > 0, True, tol)
@@ -191,7 +191,7 @@ def test_lowerbound():
check_transform(trans, Rplusbig)
check_jacobian_det(trans, Rplusbig, elemwise=True)
- check_jacobian_det(trans, Vector(Rplusbig, 2), at.dvector, [0, 0], elemwise=True)
+ check_jacobian_det(trans, Vector(Rplusbig, 2), pt.dvector, [0, 0], elemwise=True)
vals = get_values(trans)
close_to_logical(vals > 0, True, tol)
@@ -202,7 +202,7 @@ def test_upperbound():
check_transform(trans, Rminusbig)
check_jacobian_det(trans, Rminusbig, elemwise=True)
- check_jacobian_det(trans, Vector(Rminusbig, 2), at.dvector, [-1, -1], elemwise=True)
+ check_jacobian_det(trans, Vector(Rminusbig, 2), pt.dvector, [-1, -1], elemwise=True)
vals = get_values(trans)
close_to_logical(vals < 0, True, tol)
@@ -257,19 +257,19 @@ def test_ordered():
tr.Ordered(2)
check_jacobian_det(
- tr.univariate_ordered, Vector(R, 2), at.dvector, np.array([0, 0]), elemwise=False
+ tr.univariate_ordered, Vector(R, 2), pt.dvector, np.array([0, 0]), elemwise=False
)
check_jacobian_det(
- tr.multivariate_ordered, Vector(R, 2), at.dvector, np.array([0, 0]), elemwise=False
+ tr.multivariate_ordered, Vector(R, 2), pt.dvector, np.array([0, 0]), elemwise=False
)
- vals = get_values(tr.univariate_ordered, Vector(R, 3), at.dvector, np.zeros(3))
+ vals = get_values(tr.univariate_ordered, Vector(R, 3), pt.dvector, np.zeros(3))
close_to_logical(np.diff(vals) >= 0, True, tol)
def test_chain_values():
chain_tranf = tr.Chain([tr.logodds, tr.univariate_ordered])
- vals = get_values(chain_tranf, Vector(R, 5), at.dvector, np.zeros(5))
+ vals = get_values(chain_tranf, Vector(R, 5), pt.dvector, np.zeros(5))
close_to_logical(np.diff(vals) >= 0, True, tol)
@@ -281,7 +281,7 @@ def test_chain_vector_transform():
@pytest.mark.xfail(reason="Fails due to precision issue. Values just close to expected.")
def test_chain_jacob_det():
chain_tranf = tr.Chain([tr.logodds, tr.univariate_ordered])
- check_jacobian_det(chain_tranf, Vector(R, 4), at.dvector, np.zeros(4), elemwise=False)
+ check_jacobian_det(chain_tranf, Vector(R, 4), pt.dvector, np.zeros(4), elemwise=False)
class TestElementWiseLogp(SeededTest):
@@ -296,13 +296,13 @@ def check_transform_elementwise_logp(self, model):
x = model.free_RVs[0]
x_val_transf = model.rvs_to_values[x]
- pt = model.initial_point(0)
- test_array_transf = floatX(np.random.randn(*pt[x_val_transf.name].shape))
+ point = model.initial_point(0)
+ test_array_transf = floatX(np.random.randn(*point[x_val_transf.name].shape))
transform = model.rvs_to_transforms[x]
test_array_untransf = transform.backward(test_array_transf, *x.owner.inputs).eval()
# Create input variable with same dimensionality as untransformed test_array
- x_val_untransf = at.constant(test_array_untransf).type()
+ x_val_untransf = pt.constant(test_array_untransf).type()
jacob_det = transform.log_jac_det(test_array_transf, *x.owner.inputs)
assert model.logp(x, sum=False)[0].ndim == x.ndim == jacob_det.ndim
@@ -332,13 +332,13 @@ def check_vectortransform_elementwise_logp(self, model):
x = model.free_RVs[0]
x_val_transf = model.rvs_to_values[x]
- pt = model.initial_point(0)
- test_array_transf = floatX(np.random.randn(*pt[x_val_transf.name].shape))
+ point = model.initial_point(0)
+ test_array_transf = floatX(np.random.randn(*point[x_val_transf.name].shape))
transform = model.rvs_to_transforms[x]
test_array_untransf = transform.backward(test_array_transf, *x.owner.inputs).eval()
# Create input variable with same dimensionality as untransformed test_array
- x_val_untransf = at.constant(test_array_untransf).type()
+ x_val_untransf = pt.constant(test_array_untransf).type()
jacob_det = transform.log_jac_det(test_array_transf, *x.owner.inputs)
# Original distribution is univariate
@@ -417,8 +417,8 @@ def test_beta(self, a, b, size):
def test_uniform(self, lower, upper, size):
def transform_params(*inputs):
_, _, _, lower, upper = inputs
- lower = at.as_tensor_variable(lower) if lower is not None else None
- upper = at.as_tensor_variable(upper) if upper is not None else None
+ lower = pt.as_tensor_variable(lower) if lower is not None else None
+ upper = pt.as_tensor_variable(upper) if upper is not None else None
return lower, upper
interval = tr.Interval(bounds_fn=transform_params)
@@ -438,8 +438,8 @@ def transform_params(*inputs):
def test_triangular(self, lower, c, upper, size):
def transform_params(*inputs):
_, _, _, lower, _, upper = inputs
- lower = at.as_tensor_variable(lower) if lower is not None else None
- upper = at.as_tensor_variable(upper) if upper is not None else None
+ lower = pt.as_tensor_variable(lower) if lower is not None else None
+ upper = pt.as_tensor_variable(upper) if upper is not None else None
return lower, upper
interval = tr.Interval(bounds_fn=transform_params)
@@ -533,8 +533,8 @@ def test_beta_ordered(self, a, b, size):
def test_uniform_ordered(self, lower, upper, size):
def transform_params(*inputs):
_, _, _, lower, upper = inputs
- lower = at.as_tensor_variable(lower) if lower is not None else None
- upper = at.as_tensor_variable(upper) if upper is not None else None
+ lower = pt.as_tensor_variable(lower) if lower is not None else None
+ upper = pt.as_tensor_variable(upper) if upper is not None else None
return lower, upper
interval = tr.Interval(bounds_fn=transform_params)
@@ -613,9 +613,9 @@ def test_interval_transform_raises():
tr.Interval(None, None)
with pytest.raises(ValueError, match="Interval bounds must be constant values"):
- tr.Interval(at.constant(5) + 1, None)
+ tr.Interval(pt.constant(5) + 1, None)
- assert tr.Interval(at.constant(5), None)
+ assert tr.Interval(pt.constant(5), None)
def test_discrete_trafo():
diff --git a/tests/distributions/test_truncated.py b/tests/distributions/test_truncated.py
index 39ebbb6cc9..7502260dc8 100644
--- a/tests/distributions/test_truncated.py
+++ b/tests/distributions/test_truncated.py
@@ -13,7 +13,7 @@
# limitations under the License.
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy
@@ -26,10 +26,10 @@
from pymc.distributions.truncated import Truncated, TruncatedRV, _truncated
from pymc.exceptions import TruncationError
from pymc.logprob.abstract import _icdf
-from pymc.logprob.joint_logprob import logp
+from pymc.logprob.basic import logp
from pymc.logprob.transforms import IntervalTransform
from pymc.logprob.utils import ParameterValueError
-from tests.distributions.util import assert_moment_is_expected
+from pymc.testing import assert_moment_is_expected
class IcdfNormalRV(NormalRV):
@@ -71,7 +71,7 @@ def _icdf_not_implemented(*args, **kwargs):
@pytest.mark.parametrize("shape_info", ("shape", "dims", "observed"))
def test_truncation_specialized_op(shape_info):
rng = pytensor.shared(np.random.default_rng())
- x = at.random.normal(0, 10, rng=rng, name="x")
+ x = pt.random.normal(0, 10, rng=rng, name="x")
with Model(coords={"dim": range(100)}) as m:
if shape_info == "shape":
@@ -95,7 +95,7 @@ def test_truncation_specialized_op(shape_info):
# Test RNG is not reused
assert xt.owner.inputs[0] is not rng
- lower_upper = at.stack(xt.owner.inputs[5:])
+ lower_upper = pt.stack(xt.owner.inputs[5:])
assert np.all(lower_upper.eval() == [5, 15])
@@ -174,7 +174,6 @@ def test_truncation_discrete_random(op_type, lower, upper):
x = geometric_op(p, name="x", size=500)
xt = Truncated.dist(x, lower=lower, upper=upper)
assert isinstance(xt.owner.op, TruncatedRV)
- assert xt.type.dtype == x.type.dtype
xt_draws = draw(xt)
assert np.all(xt_draws >= lower)
@@ -235,24 +234,24 @@ def ref_xt_logpmf(value):
def test_truncation_exceptions():
with pytest.raises(ValueError, match="lower and upper cannot both be None"):
- Truncated.dist(at.random.normal())
+ Truncated.dist(pt.random.normal())
# Truncation does not work with SymbolicRV inputs
with pytest.raises(
NotImplementedError,
match="Truncation not implemented for SymbolicRandomVariable CensoredRV",
):
- Truncated.dist(Censored.dist(at.random.normal(), lower=-1, upper=1), -1, 1)
+ Truncated.dist(Censored.dist(pt.random.normal(), lower=-1, upper=1), -1, 1)
with pytest.raises(
NotImplementedError,
match="Truncation not implemented for multivariate distributions",
):
- Truncated.dist(at.random.dirichlet([1, 1, 1]), -1, 1)
+ Truncated.dist(pt.random.dirichlet([1, 1, 1]), -1, 1)
def test_truncation_logprob_bound_check():
- x = at.random.normal(name="x")
+ x = pt.random.normal(name="x")
xt = Truncated.dist(x, lower=5, upper=-5)
with pytest.raises(ParameterValueError):
logp(xt, 0).eval()
diff --git a/tests/gp/test_cov.py b/tests/gp/test_cov.py
index 495120b5f2..e671ba8fc3 100644
--- a/tests/gp/test_cov.py
+++ b/tests/gp/test_cov.py
@@ -15,7 +15,7 @@
import numpy as np
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import pymc as pm
@@ -71,7 +71,7 @@ def test_rightadd_matrix(self):
def test_leftadd_matrixt(self):
X = np.linspace(0, 1, 10)[:, None]
- M = 2 * at.ones((10, 10))
+ M = 2 * pt.ones((10, 10))
with pm.Model() as model:
cov = M + pm.gp.cov.ExpQuad(1, 0.1)
K = cov(X).eval()
@@ -181,6 +181,68 @@ def test_inv_rightprod(self):
cov = M + pm.gp.cov.ExpQuad(1, 1.0)
+class TestCovPSD:
+ def test_covpsd_add(self):
+ L = 10.0
+ omega = np.pi * np.arange(1, 101) / (2 * L)
+ with pm.Model() as model:
+ cov1 = 2 * pm.gp.cov.ExpQuad(1, 0.1)
+ cov2 = 5 * pm.gp.cov.ExpQuad(1, 1.0)
+ cov = cov1 + cov2
+ psd1 = cov1.power_spectral_density(omega[:, None]).eval()
+ psd2 = cov2.power_spectral_density(omega[:, None]).eval()
+ psd = cov.power_spectral_density(omega[:, None]).eval()
+ npt.assert_allclose(psd, psd1 + psd2)
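+ # The expected additivity follows from linearity of the Fourier transform: for
+ # stationary kernels, the spectral density of cov1 + cov2 is psd1 + psd2.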
+
+ def test_covpsd_multiply(self):
+ # This could be implemented via convolution
+ L = 10.0
+ omega = np.pi * np.arange(1, 101) / (2 * L)
+ with pm.Model() as model:
+ cov1 = 2 * pm.gp.cov.ExpQuad(1, ls=1)
+ cov2 = pm.gp.cov.ExpQuad(1, ls=1)
+
+ msg = "The power spectral density of products of covariance functions is not implemented"
+ with pytest.raises(NotImplementedError, match=msg):
+ psd = (cov1 * cov2).power_spectral_density(omega[:, None]).eval()
+
+ def test_covpsd_nonstationary1(self):
+ L = 10.0
+ omega = np.pi * np.arange(1, 101) / (2 * L)
+ with pm.Model() as model:
+ cov = 2 * pm.gp.cov.Linear(1, c=5)
+
+ msg = "can only be calculated for `Stationary` covariance functions."
+ with pytest.raises(ValueError, match=msg):
+ psd = cov.power_spectral_density(omega[:, None]).eval()
+
+ def test_covpsd_nonstationary2(self):
+ L = 10.0
+ omega = np.pi * np.arange(1, 101) / (2 * L)
+ with pm.Model() as model:
+ cov = 2 * pm.gp.cov.ExpQuad(1, ls=1) + 10.0
+
+ # Even though this should error, this isn't the appropriate message. The actual problem
+ # is that the covariance function is non-stationary; the misleading error comes from
+ # `Constant` covariances not having an input_dim.
+ msg = "All covariances must have the same `input_dim`."
+ with pytest.raises(ValueError, match=msg):
+ psd = cov.power_spectral_density(omega[:, None]).eval()
+
+ def test_covpsd_notimplemented(self):
+ class NewStationaryCov(pm.gp.cov.Stationary):
+ pass
+
+ L = 10.0
+ omega = np.pi * np.arange(1, 101) / (2 * L)
+ with pm.Model() as model:
+ cov = 2 * NewStationaryCov(1, ls=1)
+
+ msg = "No power spectral density method has been implemented"
+ with pytest.raises(NotImplementedError, match=msg):
+ psd = cov.power_spectral_density(omega[:, None]).eval()
+
+
class TestCovExponentiation:
def test_symexp_cov(self):
X = np.linspace(0, 1, 10)[:, None]
@@ -207,7 +269,7 @@ def test_covexp_numpy(self):
def test_covexp_pytensor(self):
X = np.linspace(0, 1, 10)[:, None]
with pm.Model() as model:
- a = at.alloc(2.0, 1, 1)
+ a = pt.alloc(2.0, 1, 1)
cov = pm.gp.cov.ExpQuad(1, 0.1) ** a
K = cov(X).eval()
npt.assert_allclose(K[0, 1], 0.53940**2, atol=1e-3)
@@ -228,7 +290,9 @@ def test_covexp_shared(self):
def test_invalid_covexp(self):
X = np.linspace(0, 1, 10)[:, None]
- with pytest.raises(ValueError, match=r"can only be exponentiated by a scalar value"):
+ with pytest.raises(
+ ValueError, match=r"A covariance function can only be exponentiated by a scalar value"
+ ):
with pm.Model() as model:
a = np.array([[1.0, 2.0]])
cov = pm.gp.cov.ExpQuad(1, 0.1) ** a
@@ -262,7 +326,7 @@ def test_multiops(self):
+ pm.gp.cov.ExpQuad(1, 0.1)
+ pm.gp.cov.ExpQuad(1, 0.1) * pm.gp.cov.ExpQuad(1, 0.1)
)
- cov2 = pm.gp.cov.ExpQuad(1, 0.1) * pm.gp.cov.ExpQuad(2, 0.1)
+ cov2 = pm.gp.cov.ExpQuad(2, 0.1) * pm.gp.cov.ExpQuad(2, 0.1)
cov = pm.gp.cov.Kron([cov1, cov2])
K_true = kronecker(cov1(X1).eval(), cov2(X2).eval()).eval()
K = cov(X).eval()
@@ -373,6 +437,17 @@ def test_inv_lengthscale(self):
Kd = cov(X, diag=True).eval()
npt.assert_allclose(np.diag(K), Kd, atol=1e-5)
+ def test_psd(self):
+ # compare to simple 1d formula
+ X = np.linspace(0, 1, 10)[:, None]
+ omega = np.linspace(0, 2, 50)
+ ell = 2.0
+ true_1d_psd = np.sqrt(2 * np.pi * np.square(ell)) * np.exp(-0.5 * np.square(ell * omega))
+ test_1d_psd = (
+ pm.gp.cov.ExpQuad(1, ls=ell).power_spectral_density(omega[:, None]).flatten().eval()
+ )
+ npt.assert_allclose(true_1d_psd, test_1d_psd, atol=1e-5)
+
class TestWhiteNoise:
def test_1d(self):
@@ -449,6 +524,18 @@ def test_1d(self):
Kd = cov(X, diag=True).eval()
npt.assert_allclose(np.diag(K), Kd, atol=1e-5)
+ def test_psd(self):
+ # compare to simple 1d formula
+ X = np.linspace(0, 1, 10)[:, None]
+ omega = np.linspace(0, 2, 50)
+ ell = 2.0
+ lamda = np.sqrt(5) / ell
+ true_1d_psd = (16.0 / 3.0) * np.power(lamda, 5) * np.power(lamda**2 + omega**2, -3)
+ test_1d_psd = (
+ pm.gp.cov.Matern52(1, ls=ell).power_spectral_density(omega[:, None]).flatten().eval()
+ )
+ npt.assert_allclose(true_1d_psd, test_1d_psd, atol=1e-5)
+
class TestMatern32:
def test_1d(self):
@@ -463,6 +550,18 @@ def test_1d(self):
Kd = cov(X, diag=True).eval()
npt.assert_allclose(np.diag(K), Kd, atol=1e-5)
+ def test_psd(self):
+ # compare to simple 1d formula
+ X = np.linspace(0, 1, 10)[:, None]
+ omega = np.linspace(0, 2, 50)
+ ell = 2.0
+ lamda = np.sqrt(3) / ell
+ true_1d_psd = 4 * np.power(lamda, 3) * np.power(lamda**2 + omega**2, -2)
+ test_1d_psd = (
+ pm.gp.cov.Matern32(1, ls=ell).power_spectral_density(omega[:, None]).flatten().eval()
+ )
+ npt.assert_allclose(true_1d_psd, test_1d_psd, atol=1e-5)
+
class TestMatern12:
def test_1d(self):
@@ -538,7 +637,7 @@ def test_1d(self):
X = np.linspace(0, 1, 10)[:, None]
def warp_func(x, a, b, c):
- return x + (a * at.tanh(b * (x - c)))
+ return x + (a * pt.tanh(b * (x - c)))
with pm.Model() as model:
cov_m52 = pm.gp.cov.Matern52(1, 0.2)
@@ -564,7 +663,7 @@ def test_1d(self):
X = np.linspace(0, 2, 10)[:, None]
def tanh_func(x, x1, x2, w, x0):
- return (x1 + x2) / 2.0 - (x1 - x2) / 2.0 * at.tanh((x - x0) / w)
+ return (x1 + x2) / 2.0 - (x1 - x2) / 2.0 * pt.tanh((x - x0) / w)
with pm.Model() as model:
cov = pm.gp.cov.Gibbs(1, tanh_func, args=(0.05, 0.6, 0.4, 1.0))
diff --git a/tests/gp/test_hsgp_approx.py b/tests/gp/test_hsgp_approx.py
new file mode 100644
index 0000000000..b6f03a4acc
--- /dev/null
+++ b/tests/gp/test_hsgp_approx.py
@@ -0,0 +1,210 @@
+# Copyright 2023 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import arviz as az
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import pytest
+import scipy as sp
+
+from scipy.spatial import distance
+
+import pymc as pm
+
+
+def build_mmd_func(sample1, sample2):
+ """Build a PyTensor function that calculates the minimum mean discrepancy (MMD) statistic."""
+
+ assert sample1.shape[1] == sample2.shape[1]
+
+ s1 = pt.matrix(name="s1", shape=sample1.shape)
+ s2 = pt.matrix(name="s2", shape=sample2.shape)
+
+ X = np.concatenate((sample1, sample2), axis=0)
+ test_ell = np.median(distance.pdist(X)) / 2
+
+ K = pm.gp.cov.ExpQuad(sample1.shape[1], ls=test_ell)
+ Kxx = K(s1)
+ Kyy = K(s2)
+ Kxy = K(s1, s2)
+
+ n_x, n_y = s1.shape[0], s2.shape[0]
+ mmd = (
+ (pt.sum(Kxx) / (n_x * (n_x - 1)))
+ + (pt.sum(Kyy) / (n_y * (n_y - 1)))
+ - 2 * pt.sum(Kxy) / (n_x * n_y)
+ )
+
+ calc_mmd = pytensor.function(inputs=[s1, s2], outputs=mmd)
+ return calc_mmd
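+
+# Illustrative usage sketch (comments only, not part of the test suite; `rng` is an assumed
+# numpy Generator). The returned function is evaluated on arrays with the same shapes used
+# to build it, e.g.
+#   s_a, s_b = rng.standard_normal((50, 2)), rng.standard_normal((50, 2)) + 1.0
+#   calc_mmd = build_mmd_func(s_a, s_b)
+#   mmd_value = calc_mmd(s_a, s_b)  # larger values indicate a distributional mismatch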
+
+
+def two_sample_test(sample1, sample2, n_sims=1000, alpha=0.05):
+ """Calculate test whose null hypothesis is that two sets of samples were drawn from
+ the same distribution.
+
+ Largely taken from https://torchdrift.org/notebooks/note_on_mmd.html
+ """
+ # build function to calculate mmd
+ calc_mmd = build_mmd_func(sample1, sample2)
+
+ # simulate test statistic under null hypothesis
+ X = np.concatenate((sample1, sample2), axis=0)
+ half_N = int(X.shape[0] // 2)
+ ix = np.arange(half_N * 2)
+
+ h0 = []
+ for i in range(n_sims):
+ np.random.shuffle(ix)
+ X = X[ix, :]
+ h0.append(calc_mmd(X[:half_N, :], X[half_N:, :]))
+ h0 = np.asarray(h0)
+ critical_value = np.percentile(h0, 100 * (1 - alpha))
+ mmd = calc_mmd(sample1, sample2)
+ return h0, mmd, critical_value, mmd > critical_value
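+
+# Illustrative call (a sketch, not executed here): with two (n, d) arrays drawn from the
+# same distribution, `reject` should be False at the chosen significance level, e.g.
+#   h0, mmd, critical_value, reject = two_sample_test(s_a, s_b, n_sims=500, alpha=0.01)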
+
+
+class TestHSGP:
+ @pytest.fixture
+ def rng(self):
+ return np.random.RandomState(10)
+
+ @pytest.fixture
+ def data(self, rng):
+ # 1D dataset
+ X1 = np.linspace(-5, 5, 100)[:, None]
+
+ # 3D dataset
+ x1, x2, x3 = np.meshgrid(
+ np.linspace(0, 10, 5), np.linspace(20, 30, 5), np.linspace(10, 20, 5)
+ )
+ X2 = np.vstack([x1.flatten(), x2.flatten(), x3.flatten()]).T
+ return X1, X2
+
+ @pytest.fixture
+ def X1(self, data):
+ return data[0]
+
+ @pytest.fixture
+ def X2(self, data):
+ return data[1]
+
+ @pytest.fixture
+ def model(self):
+ return pm.Model()
+
+ @pytest.fixture
+ def cov_func(self):
+ return pm.gp.cov.ExpQuad(1, ls=1)
+
+ @pytest.fixture
+ def gp(self, cov_func):
+ gp = pm.gp.Latent(cov_func=cov_func)
+ return gp
+
+ def test_set_boundaries_1d(self, X1):
+ X1s = X1 - np.mean(X1, axis=0)
+ L = pm.gp.hsgp_approx.set_boundary(X1s, c=2).eval()
+ assert np.all(L == 10)
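+ # X1 spans [-5, 5]; after centering the half-width is 5, so c=2 yields a boundary of 10.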
+
+ def test_set_boundaries_3d(self, X2):
+ X2s = X2 - np.mean(X2, axis=0)
+ L = pm.gp.hsgp_approx.set_boundary(X2s, c=2).eval()
+ assert np.all(L == 10)
+
+ def test_parametrization(self):
+ err_msg = "`m` and L, if provided, must be sequences with one element per active dimension"
+
+ with pytest.raises(ValueError, match=err_msg):
+ # m must be a list
+ cov_func = pm.gp.cov.ExpQuad(1, ls=0.1)
+ pm.gp.HSGP(m=500, c=2, cov_func=cov_func)
+
+ with pytest.raises(ValueError, match=err_msg):
+ # m must have same length as L
+ cov_func = pm.gp.cov.ExpQuad(2, ls=[1, 2])
+ pm.gp.HSGP(m=[500], L=[12, 12], cov_func=cov_func)
+
+ with pytest.raises(ValueError, match=err_msg):
+ # m must have same length as L, and match number of active dims of cov_func
+ cov_func = pm.gp.cov.ExpQuad(1, ls=0.1)
+ pm.gp.HSGP(m=[500], L=[12, 12], cov_func=cov_func)
+
+ # pass without error, cov_func has 2 active dimensions, c given as scalar
+ cov_func = pm.gp.cov.ExpQuad(3, ls=[1, 2], active_dims=[0, 2])
+ pm.gp.HSGP(m=[50, 50], c=2, cov_func=cov_func)
+
+ # pass without error, all have two dimensions
+ cov_func = pm.gp.cov.ExpQuad(2, ls=[1, 2])
+ pm.gp.HSGP(m=[50, 50], L=[12, 12], cov_func=cov_func)
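+
+ # Minimal 1D construction sketch (assumption, mirroring the valid cases above):
+ #   cov_func = pm.gp.cov.ExpQuad(1, ls=1.0)
+ #   gp = pm.gp.HSGP(m=[200], c=2.0, cov_func=cov_func)
+ #   f = gp.prior("f", X=X1)  # X1 is an (n, 1) array of inputs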
+
+ @pytest.mark.parametrize("drop_first", [True, False])
+ def test_parametrization_drop_first(self, model, cov_func, X1, drop_first):
+ n_basis = 100
+ with model:
+ gp = pm.gp.HSGP(m=[n_basis], c=4.0, cov_func=cov_func, drop_first=drop_first)
+ gp.prior("f1", X1)
+
+ n_coeffs = model.f1_hsgp_coeffs_.type.shape[0]
+ if drop_first:
+ assert (
+ n_coeffs == n_basis - 1
+ ), f"one basis vector should have been dropped, {n_coeffs}"
+ else:
+ assert n_coeffs == n_basis, "one was dropped when it shouldn't have been"
+
+ @pytest.mark.parametrize("parameterization", ["centered", "noncentered"])
+ def test_prior(self, model, cov_func, X1, parameterization):
+ """Compare HSGP prior to unapproximated GP prior, pm.gp.Latent. Draw samples from the
+ prior and compare them using MMD two sample test. Tests both centered and non-centered
+ parameterizations.
+ """
+ with model:
+ hsgp = pm.gp.HSGP(m=[200], c=2.0, parameterization=parameterization, cov_func=cov_func)
+ f1 = hsgp.prior("f1", X=X1)
+
+ gp = pm.gp.Latent(cov_func=cov_func)
+ f2 = gp.prior("f2", X=X1)
+
+ idata = pm.sample_prior_predictive(samples=1000)
+
+ samples1 = az.extract(idata.prior["f1"])["f1"].values.T
+ samples2 = az.extract(idata.prior["f2"])["f2"].values.T
+
+ h0, mmd, critical_value, reject = two_sample_test(
+ samples1, samples2, n_sims=500, alpha=0.01
+ )
+ assert not reject, "H0 was rejected, even though HSGP and GP priors should match."
+
+ @pytest.mark.parametrize("parameterization", ["centered", "noncentered"])
+ def test_conditional(self, model, cov_func, X1, parameterization):
+ """Compare HSGP conditional to unapproximated GP prior, pm.gp.Latent. Draw samples from the
+ prior and compare them using MMD two sample test. Tests both centered and non-centered
+ parameterizations. The conditional should match the prior when no data is observed.
+ """
+ with model:
+ hsgp = pm.gp.HSGP(m=[100], c=2.0, parameterization=parameterization, cov_func=cov_func)
+ f = hsgp.prior("f", X=X1)
+ fc = hsgp.conditional("fc", Xnew=X1)
+
+ idata = pm.sample_prior_predictive(samples=1000)
+
+ samples1 = az.extract(idata.prior["f"])["f"].values.T
+ samples2 = az.extract(idata.prior["fc"])["fc"].values.T
+
+ h0, mmd, critical_value, reject = two_sample_test(
+ samples1, samples2, n_sims=500, alpha=0.01
+ )
+ assert not reject, "H0 was rejected, even though HSGP prior and conditional should match."
diff --git a/tests/gp/test_util.py b/tests/gp/test_util.py
index 9961a27e62..c2aaa25d1e 100644
--- a/tests/gp/test_util.py
+++ b/tests/gp/test_util.py
@@ -14,7 +14,7 @@
import numpy as np
import numpy.testing as npt
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import pymc as pm
@@ -61,7 +61,7 @@ def test_kmeans(self):
Xu = pm.gp.util.kmeans_inducing_points(2, X).flatten()
npt.assert_allclose(np.asarray(self.centers), np.sort(Xu), rtol=0.05)
- X = at.as_tensor_variable(self.x[:, None])
+ X = pt.as_tensor_variable(self.x[:, None])
Xu = pm.gp.util.kmeans_inducing_points(2, X).flatten()
npt.assert_allclose(np.asarray(self.centers), np.sort(Xu), rtol=0.05)
@@ -84,7 +84,7 @@ def test_basic_replace(self):
def test_replace_no_inputs_needed(self):
with pm.Model() as model:
- a = at.as_tensor_variable(2.0)
+ a = pt.as_tensor_variable(2.0)
b = 1.0 + a
c = a * b
(c_val,) = pm.gp.util.replace_with_values([c], replacements={"x": 100})
diff --git a/tests/helpers.py b/tests/helpers.py
index 343861787b..f3aa4f914e 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -24,34 +24,14 @@
import pytensor
from pytensor.gradient import verify_grad as at_verify_grad
-from pytensor.graph import ancestors
-from pytensor.graph.rewriting.basic import in2out
-from pytensor.tensor.random.op import RandomVariable
import pymc as pm
-from pymc.pytensorf import local_check_parameter_to_ninf_switch
+from pymc.testing import fast_unstable_sampling_mode
from tests.checks import close_to
from tests.models import mv_simple, mv_simple_coarse
-class SeededTest:
- random_seed = 20160911
- random_state = None
-
- @classmethod
- def setup_class(cls):
- nr.seed(cls.random_seed)
-
- def setup_method(self):
- nr.seed(self.random_seed)
-
- def get_random_state(self, reset=False):
- if self.random_state is None or reset:
- self.random_state = nr.RandomState(self.random_seed)
- return self.random_state
-
-
class LoggingHandler(BufferingHandler):
def __init__(self, matcher):
# BufferingHandler takes a "capacity" argument
@@ -112,12 +92,6 @@ def match_value(self, k, dv, v):
return result
-def select_by_precision(float64, float32):
- """Helper function to choose reasonable decimal cutoffs for different floatX modes."""
- decimal = float64 if pytensor.config.floatX == "float64" else float32
- return decimal
-
-
@contextlib.contextmanager
def not_raises():
yield
@@ -137,21 +111,6 @@ def assert_random_state_equal(state1, state2):
assert field1 == field2
-# This mode can be used for tests where model compilations takes the bulk of the runtime
-# AND where we don't care about posterior numerical or sampling stability (e.g., when
-# all that matters are the shape of the draws or deterministic values of observed data).
-# DO NOT USE UNLESS YOU HAVE A GOOD REASON TO!
-fast_unstable_sampling_mode = (
- pytensor.compile.mode.FAST_COMPILE
- # Remove slow rewrite phases
- .excluding("canonicalize", "specialize")
- # Include necessary rewrites for proper logp handling
- .including("remove_TransformedVariables").register(
- (in2out(local_check_parameter_to_ninf_switch), -1)
- )
-)
-
-
class StepMethodTester:
def setup_class(self):
self.temp_dir = tempfile.mkdtemp()
@@ -213,8 +172,3 @@ def continuous_steps(self, step, step_kwargs):
assert {m.rvs_to_values[c1], m.rvs_to_values[c2]} == set(
step([c1, c2], **step_kwargs).vars
)
-
-
-def assert_no_rvs(var):
- assert not any(isinstance(v.owner.op, RandomVariable) for v in ancestors([var]) if v.owner)
- return var
diff --git a/tests/logprob/test_abstract.py b/tests/logprob/test_abstract.py
index 43dc39333a..21ab0c82d8 100644
--- a/tests/logprob/test_abstract.py
+++ b/tests/logprob/test_abstract.py
@@ -37,7 +37,7 @@
import re
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats.distributions as sp
@@ -51,10 +51,11 @@
MeasurableVariable,
UnmeasurableVariable,
_get_measurable_outputs,
+ _logcdf_helper,
assign_custom_measurable_outputs,
- logcdf,
noop_measurable_outputs_fn,
)
+from pymc.logprob.basic import logcdf
def assert_equal_hash(classA, classB):
@@ -114,7 +115,7 @@ def test_unmeasurable_meta_hash_reassignment():
def test_assign_custom_measurable_outputs():
- srng = at.random.RandomStream(seed=2320)
+ srng = pt.random.RandomStream(seed=2320)
X_rv = srng.normal(-10.0, 0.1, name="X")
Y_rv = srng.normal(10.0, 0.1, name="Y")
@@ -155,13 +156,13 @@ class TestMeasurableElemwise(MeasurableElemwise):
def test_logcdf_helper():
- value = at.vector("value")
+ value = pt.vector("value")
x = pm.Normal.dist(0, 1)
- x_logcdf = logcdf(x, value)
+ x_logcdf = _logcdf_helper(x, value)
np.testing.assert_almost_equal(x_logcdf.eval({value: [0, 1]}), sp.norm(0, 1).logcdf([0, 1]))
- x_logcdf = logcdf(x, [0, 1])
+ x_logcdf = _logcdf_helper(x, [0, 1])
np.testing.assert_almost_equal(x_logcdf.eval(), sp.norm(0, 1).logcdf([0, 1]))
diff --git a/tests/logprob/test_joint_logprob.py b/tests/logprob/test_basic.py
similarity index 55%
rename from tests/logprob/test_joint_logprob.py
rename to tests/logprob/test_basic.py
index f0336821b7..052e6031ca 100644
--- a/tests/logprob/test_joint_logprob.py
+++ b/tests/logprob/test_basic.py
@@ -38,7 +38,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats.distributions as sp
@@ -55,108 +55,115 @@
import pymc as pm
-from pymc.logprob.abstract import logprob
-from pymc.logprob.joint_logprob import factorized_joint_logprob, joint_logp
+from pymc.logprob.basic import factorized_joint_logprob, icdf, joint_logp, logcdf, logp
+from pymc.logprob.transforms import LogTransform
from pymc.logprob.utils import rvs_to_value_vars, walk_model
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc.pytensorf import replace_rvs_by_values
+from pymc.testing import assert_no_rvs
-def test_joint_logprob_basic():
- # A simple check for when `joint_logprob` is the same as `logprob`
- a = at.random.uniform(0.0, 1.0)
+def test_factorized_joint_logprob_basic():
+ # A simple check for when `factorized_joint_logprob` is the same as `logprob`
+ a = pt.random.uniform(0.0, 1.0)
a.name = "a"
a_value_var = a.clone()
- a_logp = joint_logprob({a: a_value_var}, sum=False)
- a_logp_exp = logprob(a, a_value_var)
+ a_logp = factorized_joint_logprob({a: a_value_var})
+ a_logp_comb = tuple(a_logp.values())[0]
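+ # factorized_joint_logprob returns a {value_var: logp_term} dict, so with a single
+ # variable the lone factor is extracted before comparing graphs.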
+ a_logp_exp = logp(a, a_value_var)
- assert equal_computations([a_logp], [a_logp_exp])
+ assert equal_computations([a_logp_comb], [a_logp_exp])
# Let's try a hierarchical model
- sigma = at.random.invgamma(0.5, 0.5)
- Y = at.random.normal(0.0, sigma)
+ sigma = pt.random.invgamma(0.5, 0.5)
+ Y = pt.random.normal(0.0, sigma)
sigma_value_var = sigma.clone()
y_value_var = Y.clone()
- total_ll = joint_logprob({Y: y_value_var, sigma: sigma_value_var}, sum=False)
+ total_ll = factorized_joint_logprob({Y: y_value_var, sigma: sigma_value_var})
+ total_ll_combined = pt.add(*total_ll.values())
# We need to replace the reference to `sigma` in `Y` with its value
# variable
- ll_Y = logprob(Y, y_value_var)
+ ll_Y = logp(Y, y_value_var)
(ll_Y,), _ = rvs_to_value_vars(
[ll_Y],
initial_replacements={sigma: sigma_value_var},
)
- total_ll_exp = logprob(sigma, sigma_value_var) + ll_Y
+ total_ll_exp = logp(sigma, sigma_value_var) + ll_Y
- assert equal_computations([total_ll], [total_ll_exp])
+ assert equal_computations([total_ll_combined], [total_ll_exp])
# Now, make sure we can compute a joint log-probability for a hierarchical
# model with some non-`RandomVariable` nodes
- c = at.random.normal()
+ c = pt.random.normal()
c.name = "c"
b_l = c * a + 2.0
- b = at.random.uniform(b_l, b_l + 1.0)
+ b = pt.random.uniform(b_l, b_l + 1.0)
b.name = "b"
b_value_var = b.clone()
c_value_var = c.clone()
- b_logp = joint_logprob({a: a_value_var, b: b_value_var, c: c_value_var})
+ b_logp = factorized_joint_logprob({a: a_value_var, b: b_value_var, c: c_value_var})
+ b_logp_combined = pt.sum([pt.sum(factor) for factor in b_logp.values()])
# There shouldn't be any `RandomVariable`s in the resulting graph
- assert_no_rvs(b_logp)
+ assert_no_rvs(b_logp_combined)
- res_ancestors = list(walk_model((b_logp,), walk_past_rvs=True))
+ res_ancestors = list(walk_model((b_logp_combined,), walk_past_rvs=True))
assert b_value_var in res_ancestors
assert c_value_var in res_ancestors
assert a_value_var in res_ancestors
-def test_joint_logprob_multi_obs():
- a = at.random.uniform(0.0, 1.0)
- b = at.random.normal(0.0, 1.0)
+def test_factorized_joint_logprob_multi_obs():
+ a = pt.random.uniform(0.0, 1.0)
+ b = pt.random.normal(0.0, 1.0)
a_val = a.clone()
b_val = b.clone()
- logp = joint_logprob({a: a_val, b: b_val}, sum=False)
- logp_exp = logprob(a, a_val) + logprob(b, b_val)
+ logp_res = factorized_joint_logprob({a: a_val, b: b_val})
+ logp_res_combined = pt.add(*logp_res.values())
+ logp_exp = logp(a, a_val) + logp(b, b_val)
- assert equal_computations([logp], [logp_exp])
+ assert equal_computations([logp_res_combined], [logp_exp])
- x = at.random.normal(0, 1)
- y = at.random.normal(x, 1)
+ x = pt.random.normal(0, 1)
+ y = pt.random.normal(x, 1)
x_val = x.clone()
y_val = y.clone()
- logp = joint_logprob({x: x_val, y: y_val})
- exp_logp = joint_logprob({x: x_val, y: y_val})
+ logp_res = factorized_joint_logprob({x: x_val, y: y_val})
+ exp_logp = factorized_joint_logprob({x: x_val, y: y_val})
+ logp_res_comb = pt.sum([pt.sum(factor) for factor in logp_res.values()])
+ exp_logp_comb = pt.sum([pt.sum(factor) for factor in exp_logp.values()])
- assert equal_computations([logp], [exp_logp])
+ assert equal_computations([logp_res_comb], [exp_logp_comb])
-def test_joint_logprob_diff_dims():
- M = at.matrix("M")
- x = at.random.normal(0, 1, size=M.shape[1], name="X")
- y = at.random.normal(M.dot(x), 1, name="Y")
+def test_factorized_joint_logprob_diff_dims():
+ M = pt.matrix("M")
+ x = pt.random.normal(0, 1, size=M.shape[1], name="X")
+ y = pt.random.normal(M.dot(x), 1, name="Y")
x_vv = x.clone()
x_vv.name = "x"
y_vv = y.clone()
y_vv.name = "y"
- logp = joint_logprob({x: x_vv, y: y_vv})
+ logp = factorized_joint_logprob({x: x_vv, y: y_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
M_val = np.random.normal(size=(10, 3))
x_val = np.random.normal(size=(3,))
y_val = np.random.normal(size=(10,))
point = {M: M_val, x_vv: x_val, y_vv: y_val}
- logp_val = logp.eval(point)
+ logp_val = logp_combined.eval(point)
exp_logp_val = (
sp.norm.logpdf(x_val, 0, 1).sum() + sp.norm.logpdf(y_val, M_val.dot(x_val), 1).sum()
@@ -170,95 +177,53 @@ def test_incsubtensor_original_values_output_dict():
the logprob factor
"""
- base_rv = at.random.normal(0, 1, size=2)
- rv = at.set_subtensor(base_rv[0], 5)
+ base_rv = pt.random.normal(0, 1, size=2)
+ rv = pt.set_subtensor(base_rv[0], 5)
vv = rv.clone()
logp_dict = factorized_joint_logprob({rv: vv})
assert vv in logp_dict
-def test_joint_logprob_subtensor():
- """Make sure we can compute a joint log-probability for ``Y[I]`` where ``Y`` and ``I`` are random variables."""
-
- size = 5
-
- mu_base = np.power(10, np.arange(np.prod(size))).reshape(size)
- mu = np.stack([mu_base, -mu_base])
- sigma = 0.001
- rng = pytensor.shared(np.random.RandomState(232), borrow=True)
-
- A_rv = at.random.normal(mu, sigma, rng=rng)
- A_rv.name = "A"
-
- p = 0.5
-
- I_rv = at.random.bernoulli(p, size=size, rng=rng)
- I_rv.name = "I"
-
- A_idx = A_rv[I_rv, at.ogrid[A_rv.shape[-1] :]]
-
- assert isinstance(A_idx.owner.op, (Subtensor, AdvancedSubtensor, AdvancedSubtensor1))
-
- A_idx_value_var = A_idx.type()
- A_idx_value_var.name = "A_idx_value"
-
- I_value_var = I_rv.type()
- I_value_var.name = "I_value"
-
- A_idx_logp = joint_logprob({A_idx: A_idx_value_var, I_rv: I_value_var}, sum=False)
-
- logp_vals_fn = pytensor.function([A_idx_value_var, I_value_var], A_idx_logp)
-
- # The compiled graph should not contain any `RandomVariables`
- assert_no_rvs(logp_vals_fn.maker.fgraph.outputs[0])
-
- decimals = 6 if pytensor.config.floatX == "float64" else 4
-
- test_val_rng = np.random.RandomState(3238)
-
- for i in range(10):
- bern_sp = sp.bernoulli(p)
- I_value = bern_sp.rvs(size=size, random_state=test_val_rng).astype(I_rv.dtype)
-
- norm_sp = sp.norm(mu[I_value, np.ogrid[mu.shape[1] :]], sigma)
- A_idx_value = norm_sp.rvs(random_state=test_val_rng).astype(A_idx.dtype)
-
- exp_obs_logps = norm_sp.logpdf(A_idx_value)
- exp_obs_logps += bern_sp.logpmf(I_value)
-
- logp_vals = logp_vals_fn(A_idx_value, I_value)
-
- np.testing.assert_almost_equal(logp_vals, exp_obs_logps, decimal=decimals)
-
-
def test_persist_inputs():
"""Make sure we don't unnecessarily clone variables."""
- x = at.scalar("x")
- beta_rv = at.random.normal(0, 1, name="beta")
- Y_rv = at.random.normal(beta_rv * x, 1, name="y")
+ x = pt.scalar("x")
+ beta_rv = pt.random.normal(0, 1, name="beta")
+ Y_rv = pt.random.normal(beta_rv * x, 1, name="y")
beta_vv = beta_rv.type()
y_vv = Y_rv.clone()
- logp = joint_logprob({beta_rv: beta_vv, Y_rv: y_vv})
+ logp = factorized_joint_logprob({beta_rv: beta_vv, Y_rv: y_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
- assert x in ancestors([logp])
+ assert x in ancestors([logp_combined])
# Make sure we don't clone value variables when they're graphs.
y_vv_2 = y_vv * 2
- logp_2 = joint_logprob({beta_rv: beta_vv, Y_rv: y_vv_2})
+ logp_2 = factorized_joint_logprob({beta_rv: beta_vv, Y_rv: y_vv_2})
+ logp_2_combined = pt.sum([pt.sum(factor) for factor in logp_2.values()])
- assert y_vv_2 in ancestors([logp_2])
+ assert y_vv in ancestors([logp_2_combined])
+ assert y_vv_2 in ancestors([logp_2_combined])
+ # Even when they are random
+ y_vv = pt.random.normal(name="y_vv2")
+ y_vv_2 = y_vv * 2
+ logp_2 = factorized_joint_logprob({beta_rv: beta_vv, Y_rv: y_vv_2})
+ logp_2_combined = pt.sum([pt.sum(factor) for factor in logp_2.values()])
-def test_warn_random_not_found():
- x_rv = at.random.normal(name="x")
- y_rv = at.random.normal(x_rv, 1, name="y")
+ assert y_vv in ancestors([logp_2_combined])
+ assert y_vv_2 in ancestors([logp_2_combined])
+
+
+def test_warn_random_found_factorized_joint_logprob():
+ x_rv = pt.random.normal(name="x")
+ y_rv = pt.random.normal(x_rv, 1, name="y")
y_vv = y_rv.clone()
- with pytest.warns(UserWarning):
+ with pytest.warns(UserWarning, match="Found a random variable that was neither among"):
factorized_joint_logprob({y_rv: y_vv})
with warnings.catch_warnings():
@@ -267,14 +232,14 @@ def test_warn_random_not_found():
def test_multiple_rvs_to_same_value_raises():
- x_rv1 = at.random.normal(name="x1")
- x_rv2 = at.random.normal(name="x2")
+ x_rv1 = pt.random.normal(name="x1")
+ x_rv2 = pt.random.normal(name="x2")
x = x_rv1.type()
x.name = "x"
msg = "More than one logprob factor was assigned to the value var x"
with pytest.raises(ValueError, match=msg):
- joint_logprob({x_rv1: x, x_rv2: x})
+ factorized_joint_logprob({x_rv1: x, x_rv2: x})
def test_joint_logp_basic():
@@ -332,7 +297,7 @@ def test_joint_logp_incsubtensor(indices, size):
a_value_var = a.type()
a.name = "a"
- a_idx = at.set_subtensor(a[indices], data)
+ a_idx = pt.set_subtensor(a[indices], data)
assert isinstance(a_idx.owner.op, (IncSubtensor, AdvancedIncSubtensor, AdvancedIncSubtensor1))
@@ -357,32 +322,6 @@ def test_joint_logp_incsubtensor(indices, size):
np.testing.assert_almost_equal(logp_vals, exp_obs_logps)
-def test_logp_helper():
- value = at.vector("value")
- x = pm.Normal.dist(0, 1)
-
- x_logp = pm.logp(x, value)
- np.testing.assert_almost_equal(x_logp.eval({value: [0, 1]}), sp.norm(0, 1).logpdf([0, 1]))
-
- x_logp = pm.logp(x, [0, 1])
- np.testing.assert_almost_equal(x_logp.eval(), sp.norm(0, 1).logpdf([0, 1]))
-
-
-def test_logp_helper_derived_rv():
- assert np.isclose(
- pm.logp(at.exp(pm.Normal.dist()), 5).eval(),
- pm.logp(pm.LogNormal.dist(), 5).eval(),
- )
-
-
-def test_logp_helper_exceptions():
- with pytest.raises(TypeError, match="When RV is not a pure distribution"):
- pm.logp(at.exp(pm.Normal.dist()), [1, 2])
-
- with pytest.raises(NotImplementedError, match="PyMC could not infer logp of input variable"):
- pm.logp(at.cos(pm.Normal.dist()), 1)
-
-
def test_model_unchanged_logprob_access():
# Issue #5007
with pm.Model() as model:
@@ -430,3 +369,110 @@ def test_hierarchical_obs_logp():
ops = {a.owner.op for a in logp_ancestors if a.owner}
assert len(ops) > 0
assert not any(isinstance(o, RandomVariable) for o in ops)
+
+
+@pytest.mark.parametrize(
+ "func, scipy_func",
+ [
+ (logp, "logpdf"),
+ (logcdf, "logcdf"),
+ (icdf, "ppf"),
+ ],
+)
+def test_probability_direct_dispatch(func, scipy_func):
+ value = pt.vector("value")
+ x = pm.Normal.dist(0, 1)
+
+ np.testing.assert_almost_equal(
+ func(x, value).eval({value: [0, 1]}),
+ getattr(sp.norm(0, 1), scipy_func)([0, 1]),
+ )
+
+ np.testing.assert_almost_equal(
+ func(x, [0, 1]).eval(),
+ getattr(sp.norm(0, 1), scipy_func)([0, 1]),
+ )
+
+
+@pytest.mark.parametrize(
+ "func, scipy_func, test_value",
+ [
+ (logp, "logpdf", 5.0),
+ (logcdf, "logcdf", 5.0),
+ (icdf, "ppf", 0.7),
+ ],
+)
+def test_probability_inference(func, scipy_func, test_value):
+ assert np.isclose(
+ func(pt.exp(pm.Normal.dist()), test_value).eval(),
+ getattr(sp.lognorm(s=1), scipy_func)(test_value),
+ )
+
+
+@pytest.mark.parametrize(
+ "func, func_name",
+ [
+ (logp, "Logprob"),
+ (logcdf, "LogCDF"),
+ (icdf, "Inverse CDF"),
+ ],
+)
+def test_probability_inference_fails(func, func_name):
+ with pytest.raises(
+ NotImplementedError,
+ match=f"{func_name} method not implemented for Elemwise{{cos,no_inplace}}",
+ ):
+ func(pt.cos(pm.Normal.dist()), 1)
+
+
+@pytest.mark.parametrize(
+ "func, scipy_func, test_value",
+ [
+ (logp, "logpdf", 5.0),
+ (logcdf, "logcdf", 5.0),
+ (icdf, "ppf", 0.7),
+ ],
+)
+def test_warn_random_found_probability_inference(func, scipy_func, test_value):
+ # Fail if unexpected warning is issued
+ with warnings.catch_warnings():
+ warnings.simplefilter("error")
+
+ input_rv = pm.Normal.dist(0, name="input")
+ # Note: This graph could correspond to a convolution of two normals
+ # In which case the inference should either return that or fail explicitly
+    # For now, the logprob submodule treats the input as a stochastic value.
+ rv = pt.exp(pm.Normal.dist(input_rv))
+ with pytest.warns(UserWarning, match="RandomVariables were found in the derived graph"):
+ assert func(rv, 0.0)
+
+ res = func(rv, 0.0, warn_missing_rvs=False)
+    # This is the problem the warning refers to: we can no longer identify the original rv
+    # in the graph or replace it by its respective value
+ assert rv not in ancestors([res])
+
+ # Test that the prescribed solution does not raise a warning and works as expected
+ input_vv = input_rv.clone()
+ [new_rv] = replace_rvs_by_values(
+ [rv],
+ rvs_to_values={input_rv: input_vv},
+ rvs_to_transforms={input_rv: LogTransform()},
+ )
+ input_vv_test = 1.3
+ np.testing.assert_almost_equal(
+ func(new_rv, test_value).eval({input_vv: input_vv_test}),
+ getattr(sp.lognorm(s=1, loc=0, scale=np.exp(np.exp(input_vv_test))), scipy_func)(
+ test_value
+ ),
+ )
+
+
+def test_icdf_discrete():
+ p = 0.1
+ value = 0.9
+ dist = pm.Geometric.dist(p=p)
+ dist_icdf = icdf(dist, value)
+ np.testing.assert_almost_equal(
+ dist_icdf.eval(),
+ sp.geom.ppf(value, p),
+ )
diff --git a/tests/logprob/test_censoring.py b/tests/logprob/test_censoring.py
index a5798e898c..4607dca45f 100644
--- a/tests/logprob/test_censoring.py
+++ b/tests/logprob/test_censoring.py
@@ -36,29 +36,29 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy as sp
import scipy.stats as st
+from pymc import logp
from pymc.logprob import factorized_joint_logprob
from pymc.logprob.transforms import LogTransform, TransformValuesRewrite
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc.testing import assert_no_rvs
@pytensor.config.change_flags(compute_test_value="raise")
def test_continuous_rv_clip():
- x_rv = at.random.normal(0.5, 1)
- cens_x_rv = at.clip(x_rv, -2, 2)
+ x_rv = pt.random.normal(0.5, 1)
+ cens_x_rv = pt.clip(x_rv, -2, 2)
cens_x_vv = cens_x_rv.clone()
cens_x_vv.tag.test_value = 0
- logp = joint_logprob({cens_x_rv: cens_x_vv})
- assert_no_rvs(logp)
+ logprob = pt.sum(logp(cens_x_rv, cens_x_vv))
+ assert_no_rvs(logprob)
- logp_fn = pytensor.function([cens_x_vv], logp)
+ logp_fn = pytensor.function([cens_x_vv], logprob)
ref_scipy = st.norm(0.5, 1)
assert logp_fn(-3) == -np.inf
@@ -70,15 +70,15 @@ def test_continuous_rv_clip():
def test_discrete_rv_clip():
- x_rv = at.random.poisson(2)
- cens_x_rv = at.clip(x_rv, 1, 4)
+ x_rv = pt.random.poisson(2)
+ cens_x_rv = pt.clip(x_rv, 1, 4)
cens_x_vv = cens_x_rv.clone()
- logp = joint_logprob({cens_x_rv: cens_x_vv})
- assert_no_rvs(logp)
+ logprob = pt.sum(logp(cens_x_rv, cens_x_vv))
+ assert_no_rvs(logprob)
- logp_fn = pytensor.function([cens_x_vv], logp)
+ logp_fn = pytensor.function([cens_x_vv], logprob)
ref_scipy = st.poisson(2)
assert logp_fn(0) == -np.inf
@@ -90,15 +90,15 @@ def test_discrete_rv_clip():
def test_one_sided_clip():
- x_rv = at.random.normal(0, 1)
- lb_cens_x_rv = at.clip(x_rv, -1, x_rv)
- ub_cens_x_rv = at.clip(x_rv, x_rv, 1)
+ x_rv = pt.random.normal(0, 1)
+ lb_cens_x_rv = pt.clip(x_rv, -1, x_rv)
+ ub_cens_x_rv = pt.clip(x_rv, x_rv, 1)
lb_cens_x_vv = lb_cens_x_rv.clone()
ub_cens_x_vv = ub_cens_x_rv.clone()
- lb_logp = joint_logprob({lb_cens_x_rv: lb_cens_x_vv})
- ub_logp = joint_logprob({ub_cens_x_rv: ub_cens_x_vv})
+ lb_logp = pt.sum(logp(lb_cens_x_rv, lb_cens_x_vv))
+ ub_logp = pt.sum(logp(ub_cens_x_rv, ub_cens_x_vv))
assert_no_rvs(lb_logp)
assert_no_rvs(ub_logp)
@@ -112,64 +112,70 @@ def test_one_sided_clip():
def test_useless_clip():
- x_rv = at.random.normal(0.5, 1, size=3)
- cens_x_rv = at.clip(x_rv, x_rv, x_rv)
+ x_rv = pt.random.normal(0.5, 1, size=3)
+ cens_x_rv = pt.clip(x_rv, x_rv, x_rv)
cens_x_vv = cens_x_rv.clone()
- logp = joint_logprob({cens_x_rv: cens_x_vv}, sum=False)
- assert_no_rvs(logp)
+ logprob = logp(cens_x_rv, cens_x_vv)
+ assert_no_rvs(logprob)
- logp_fn = pytensor.function([cens_x_vv], logp)
+ logp_fn = pytensor.function([cens_x_vv], logprob)
ref_scipy = st.norm(0.5, 1)
np.testing.assert_allclose(logp_fn([-2, 0, 2]), ref_scipy.logpdf([-2, 0, 2]))
def test_random_clip():
- lb_rv = at.random.normal(0, 1, size=2)
- x_rv = at.random.normal(0, 2)
- cens_x_rv = at.clip(x_rv, lb_rv, [1, 1])
+ lb_rv = pt.random.normal(0, 1, size=2)
+ x_rv = pt.random.normal(0, 2)
+ cens_x_rv = pt.clip(x_rv, lb_rv, [1, 1])
lb_vv = lb_rv.clone()
cens_x_vv = cens_x_rv.clone()
- logp = joint_logprob({cens_x_rv: cens_x_vv, lb_rv: lb_vv}, sum=False)
- assert_no_rvs(logp)
+ logp = factorized_joint_logprob({cens_x_rv: cens_x_vv, lb_rv: lb_vv})
+ logp_combined = pt.add(*logp.values())
- logp_fn = pytensor.function([lb_vv, cens_x_vv], logp)
+ assert_no_rvs(logp_combined)
+
+ logp_fn = pytensor.function([lb_vv, cens_x_vv], logp_combined)
res = logp_fn([0, -1], [-1, -1])
assert res[0] == -np.inf
assert res[1] != -np.inf
def test_broadcasted_clip_constant():
- lb_rv = at.random.uniform(0, 1)
- x_rv = at.random.normal(0, 2)
- cens_x_rv = at.clip(x_rv, lb_rv, [1, 1])
+ lb_rv = pt.random.uniform(0, 1)
+ x_rv = pt.random.normal(0, 2)
+ cens_x_rv = pt.clip(x_rv, lb_rv, [1, 1])
lb_vv = lb_rv.clone()
cens_x_vv = cens_x_rv.clone()
- logp = joint_logprob({cens_x_rv: cens_x_vv, lb_rv: lb_vv})
- assert_no_rvs(logp)
+ logp = factorized_joint_logprob({cens_x_rv: cens_x_vv, lb_rv: lb_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+
+ assert_no_rvs(logp_combined)
def test_broadcasted_clip_random():
- lb_rv = at.random.normal(0, 1)
- x_rv = at.random.normal(0, 2, size=2)
- cens_x_rv = at.clip(x_rv, lb_rv, 1)
+ lb_rv = pt.random.normal(0, 1)
+ x_rv = pt.random.normal(0, 2, size=2)
+ cens_x_rv = pt.clip(x_rv, lb_rv, 1)
lb_vv = lb_rv.clone()
cens_x_vv = cens_x_rv.clone()
- logp = joint_logprob({cens_x_rv: cens_x_vv, lb_rv: lb_vv})
- assert_no_rvs(logp)
+ logp = factorized_joint_logprob({cens_x_rv: cens_x_vv, lb_rv: lb_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+
+ assert_no_rvs(logp_combined)
def test_fail_base_and_clip_have_values():
"""Test failure when both base_rv and clipped_rv are given value vars"""
- x_rv = at.random.normal(0, 1)
- cens_x_rv = at.clip(x_rv, x_rv, 1)
+ x_rv = pt.random.normal(0, 1)
+ cens_x_rv = pt.clip(x_rv, x_rv, 1)
cens_x_rv.name = "cens_x"
x_vv = x_rv.clone()
@@ -180,10 +186,10 @@ def test_fail_base_and_clip_have_values():
def test_fail_multiple_clip_single_base():
"""Test failure when multiple clipped_rvs share a single base_rv"""
- base_rv = at.random.normal(0, 1)
- cens_rv1 = at.clip(base_rv, -1, 1)
+ base_rv = pt.random.normal(0, 1)
+ cens_rv1 = pt.clip(base_rv, -1, 1)
cens_rv1.name = "cens1"
- cens_rv2 = at.clip(base_rv, -1, 1)
+ cens_rv2 = pt.clip(base_rv, -1, 1)
cens_rv2.name = "cens2"
cens_vv1 = cens_rv1.clone()
@@ -193,16 +199,17 @@ def test_fail_multiple_clip_single_base():
def test_deterministic_clipping():
- x_rv = at.random.normal(0, 1)
- clip = at.clip(x_rv, 0, 0)
- y_rv = at.random.normal(clip, 1)
+ x_rv = pt.random.normal(0, 1)
+ clip = pt.clip(x_rv, 0, 0)
+ y_rv = pt.random.normal(clip, 1)
x_vv = x_rv.clone()
y_vv = y_rv.clone()
- logp = joint_logprob({x_rv: x_vv, y_rv: y_vv})
- assert_no_rvs(logp)
+ logp = factorized_joint_logprob({x_rv: x_vv, y_rv: y_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+ assert_no_rvs(logp_combined)
- logp_fn = pytensor.function([x_vv, y_vv], logp)
+ logp_fn = pytensor.function([x_vv, y_vv], logp_combined)
assert np.isclose(
logp_fn(-1, 1),
st.norm(0, 1).logpdf(-1) + st.norm(0, 1).logpdf(1),
@@ -210,46 +217,47 @@ def test_deterministic_clipping():
def test_clip_transform():
- x_rv = at.random.normal(0.5, 1)
- cens_x_rv = at.clip(x_rv, 0, x_rv)
+ x_rv = pt.random.normal(0.5, 1)
+ cens_x_rv = pt.clip(x_rv, 0, x_rv)
cens_x_vv = cens_x_rv.clone()
transform = TransformValuesRewrite({cens_x_vv: LogTransform()})
- logp = joint_logprob({cens_x_rv: cens_x_vv}, extra_rewrites=transform)
+ logp = factorized_joint_logprob({cens_x_rv: cens_x_vv}, extra_rewrites=transform)
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
cens_x_vv_testval = -1
- obs_logp = logp.eval({cens_x_vv: cens_x_vv_testval})
+ obs_logp = logp_combined.eval({cens_x_vv: cens_x_vv_testval})
exp_logp = sp.stats.norm(0.5, 1).logpdf(np.exp(cens_x_vv_testval)) + cens_x_vv_testval
assert np.isclose(obs_logp, exp_logp)
-@pytest.mark.parametrize("rounding_op", (at.round, at.floor, at.ceil))
+@pytest.mark.parametrize("rounding_op", (pt.round, pt.floor, pt.ceil))
def test_rounding(rounding_op):
loc = 1
scale = 2
test_value = np.arange(-3, 4)
- x = at.random.normal(loc, scale, size=test_value.shape, name="x")
+ x = pt.random.normal(loc, scale, size=test_value.shape, name="x")
xr = rounding_op(x)
xr.name = "xr"
xr_vv = xr.clone()
- logp = joint_logprob({xr: xr_vv}, sum=False)
- assert logp is not None
+ logprob = logp(xr, xr_vv)
+ assert logprob is not None
x_sp = st.norm(loc, scale)
- if rounding_op == at.round:
+ if rounding_op == pt.round:
expected_logp = np.log(x_sp.cdf(test_value + 0.5) - x_sp.cdf(test_value - 0.5))
- elif rounding_op == at.floor:
+ elif rounding_op == pt.floor:
expected_logp = np.log(x_sp.cdf(test_value + 1.0) - x_sp.cdf(test_value))
- elif rounding_op == at.ceil:
+ elif rounding_op == pt.ceil:
expected_logp = np.log(x_sp.cdf(test_value) - x_sp.cdf(test_value - 1.0))
else:
raise NotImplementedError()
assert np.allclose(
- logp.eval({xr_vv: test_value}),
+ logprob.eval({xr_vv: test_value}),
expected_logp,
)
diff --git a/tests/logprob/test_checks.py b/tests/logprob/test_checks.py
new file mode 100644
index 0000000000..a4e72cda61
--- /dev/null
+++ b/tests/logprob/test_checks.py
@@ -0,0 +1,98 @@
+# Copyright 2023 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2021-2022 aesara-devs
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import re
+
+import numpy as np
+import pytensor
+import pytensor.tensor as pt
+import pytest
+
+from pytensor.raise_op import Assert
+from scipy import stats
+
+from pymc.distributions import Dirichlet
+from pymc.logprob.joint_logprob import factorized_joint_logprob
+from tests.distributions.test_multivariate import dirichlet_logpdf
+
+
+def test_specify_shape_logprob():
+ # 1. Create graph using SpecifyShape
+ # Use symbolic last dimension, so that SpecifyShape is not useless
+ last_dim = pt.scalar(name="last_dim", dtype="int64")
+ x_base = Dirichlet.dist(pt.ones((last_dim,)), shape=(5, last_dim))
+ x_base.name = "x"
+ x_rv = pt.specify_shape(x_base, shape=(5, 3))
+ x_rv.name = "x"
+
+ # 2. Request logp
+ x_vv = x_rv.clone()
+ [x_logp] = factorized_joint_logprob({x_rv: x_vv}).values()
+
+ # 3. Test logp
+ x_logp_fn = pytensor.function([last_dim, x_vv], x_logp)
+
+ # 3.1 Test valid logp
+ x_vv_test = stats.dirichlet(np.ones((3,))).rvs(size=(5,))
+ np.testing.assert_array_almost_equal(
+ x_logp_fn(last_dim=3, x=x_vv_test),
+ dirichlet_logpdf(x_vv_test, np.ones((3,))),
+ )
+
+ # 3.2 Test shape error
+ x_vv_test_invalid = stats.dirichlet(np.ones((1,))).rvs(size=(5,))
+ with pytest.raises(TypeError, match=re.escape("not compatible with the data's ((5, 1))")):
+ x_logp_fn(last_dim=1, x=x_vv_test_invalid)
+
+
+def test_assert_logprob():
+ rv = pt.random.normal()
+ assert_op = Assert("Test assert")
+ # Example: Add assert that rv must be positive
+    assert_rv = assert_op(rv, rv > 0)
+ assert_rv.name = "assert_rv"
+
+ assert_vv = assert_rv.clone()
+ assert_logp = factorized_joint_logprob({assert_rv: assert_vv})[assert_vv]
+
+    # Check that a valid value gives the correct logp and does not raise,
+    # since the value satisfies the positivity condition
+    valid_value = 3.0
+    np.testing.assert_allclose(
+        assert_logp.eval({assert_vv: valid_value}),
+        stats.norm().logpdf(valid_value),
+    )
+
+    # Check invalid value
+    # Since the value is negative, the condition is not met and an AssertionError is raised
+ with pytest.raises(AssertionError, match="Test assert"):
+ assert_logp.eval({assert_vv: -5.0})
diff --git a/tests/logprob/test_composite_logprob.py b/tests/logprob/test_composite_logprob.py
index 9f805a62f6..f6419608ce 100644
--- a/tests/logprob/test_composite_logprob.py
+++ b/tests/logprob/test_composite_logprob.py
@@ -36,23 +36,24 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import scipy.stats as st
+from pymc import logp
+from pymc.logprob.basic import factorized_joint_logprob
from pymc.logprob.censoring import MeasurableClip
from pymc.logprob.rewriting import construct_ir_fgraph
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc.testing import assert_no_rvs
def test_scalar_clipped_mixture():
- x = at.clip(at.random.normal(loc=1), 0.5, 1.5)
+ x = pt.clip(pt.random.normal(loc=1), 0.5, 1.5)
x.name = "x"
- y = at.random.beta(1, 2, name="y")
+ y = pt.random.beta(1, 2, name="y")
- comps = at.stack([x, y])
+ comps = pt.stack([x, y])
comps.name = "comps"
- idxs = at.random.bernoulli(0.4, name="idxs")
+ idxs = pt.random.bernoulli(0.4, name="idxs")
mix = comps[idxs]
mix.name = "mix"
@@ -61,9 +62,10 @@ def test_scalar_clipped_mixture():
idxs_vv = idxs.clone()
idxs_vv.name = "idxs_val"
- logp = joint_logprob({idxs: idxs_vv, mix: mix_vv})
+ logp = factorized_joint_logprob({idxs: idxs_vv, mix: mix_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
- logp_fn = pytensor.function([idxs_vv, mix_vv], logp)
+ logp_fn = pytensor.function([idxs_vv, mix_vv], logp_combined)
assert logp_fn(0, 0.4) == -np.inf
assert np.isclose(logp_fn(0, 0.5), st.norm.logcdf(0.5, 1) + np.log(0.6))
assert np.isclose(logp_fn(0, 1.3), st.norm.logpdf(1.3, 1) + np.log(0.6))
@@ -71,25 +73,25 @@ def test_scalar_clipped_mixture():
def test_nested_scalar_mixtures():
- x = at.random.normal(loc=-50, name="x")
- y = at.random.normal(loc=50, name="y")
- comps1 = at.stack([x, y])
+ x = pt.random.normal(loc=-50, name="x")
+ y = pt.random.normal(loc=50, name="y")
+ comps1 = pt.stack([x, y])
comps1.name = "comps1"
- idxs1 = at.random.bernoulli(0.5, name="idxs1")
+ idxs1 = pt.random.bernoulli(0.5, name="idxs1")
mix1 = comps1[idxs1]
mix1.name = "mix1"
- w = at.random.normal(loc=-100, name="w")
- z = at.random.normal(loc=100, name="z")
- comps2 = at.stack([w, z])
+ w = pt.random.normal(loc=-100, name="w")
+ z = pt.random.normal(loc=100, name="z")
+ comps2 = pt.stack([w, z])
comps2.name = "comps2"
- idxs2 = at.random.bernoulli(0.5, name="idxs2")
+ idxs2 = pt.random.bernoulli(0.5, name="idxs2")
mix2 = comps2[idxs2]
mix2.name = "mix2"
- comps12 = at.stack([mix1, mix2])
+ comps12 = pt.stack([mix1, mix2])
comps12.name = "comps12"
- idxs12 = at.random.bernoulli(0.5, name="idxs12")
+ idxs12 = pt.random.bernoulli(0.5, name="idxs12")
mix12 = comps12[idxs12]
mix12.name = "mix12"
@@ -98,8 +100,12 @@ def test_nested_scalar_mixtures():
idxs12_vv = idxs12.clone()
mix12_vv = mix12.clone()
- logp = joint_logprob({idxs1: idxs1_vv, idxs2: idxs2_vv, idxs12: idxs12_vv, mix12: mix12_vv})
- logp_fn = pytensor.function([idxs1_vv, idxs2_vv, idxs12_vv, mix12_vv], logp)
+ logp = factorized_joint_logprob(
+ {idxs1: idxs1_vv, idxs2: idxs2_vv, idxs12: idxs12_vv, mix12: mix12_vv}
+ )
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+
+ logp_fn = pytensor.function([idxs1_vv, idxs2_vv, idxs12_vv, mix12_vv], logp_combined)
expected_mu_logpdf = st.norm.logpdf(0) + np.log(0.5) * 3
assert np.isclose(logp_fn(0, 0, 0, -50), expected_mu_logpdf)
@@ -117,9 +123,9 @@ def test_nested_scalar_mixtures():
def test_unvalued_ir_reversion():
"""Make sure that un-valued IR rewrites are reverted."""
- x_rv = at.random.normal()
- y_rv = at.clip(x_rv, 0, 1)
- z_rv = at.random.normal(y_rv, 1, name="z")
+ x_rv = pt.random.normal()
+ y_rv = pt.clip(x_rv, 0, 1)
+ z_rv = pt.random.normal(y_rv, 1, name="z")
z_vv = z_rv.clone()
# Only the `z_rv` is "valued", so `y_rv` doesn't need to be converted into
@@ -139,26 +145,26 @@ def test_unvalued_ir_reversion():
def test_shifted_cumsum():
- x = at.random.normal(size=(5,), name="x")
- y = 5 + at.cumsum(x)
+ x = pt.random.normal(size=(5,), name="x")
+ y = 5 + pt.cumsum(x)
y.name = "y"
y_vv = y.clone()
- logp = joint_logprob({y: y_vv})
+ logprob = logp(y, y_vv)
assert np.isclose(
- logp.eval({y_vv: np.arange(5) + 1 + 5}),
+ logprob.eval({y_vv: np.arange(5) + 1 + 5}).sum(),
st.norm.logpdf(1) * 5,
)
def test_double_log_transform_rv():
- base_rv = at.random.normal(0, 1)
- y_rv = at.log(at.log(base_rv))
+ base_rv = pt.random.normal(0, 1)
+ y_rv = pt.log(pt.log(base_rv))
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- logp_fn = pytensor.function([y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ logp_fn = pytensor.function([y_vv], logprob)
log_log_y_val = np.asarray(0.5)
log_y_val = np.exp(log_log_y_val)
@@ -170,17 +176,17 @@ def test_double_log_transform_rv():
def test_affine_transform_rv():
- loc = at.scalar("loc")
- scale = at.vector("scale")
+ loc = pt.scalar("loc")
+ scale = pt.vector("scale")
rv_size = 3
- y_rv = loc + at.random.normal(0, 1, size=rv_size, name="base_rv") * scale
+ y_rv = loc + pt.random.normal(0, 1, size=rv_size, name="base_rv") * scale
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- assert_no_rvs(logp)
- logp_fn = pytensor.function([loc, scale, y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ assert_no_rvs(logprob)
+ logp_fn = pytensor.function([loc, scale, y_vv], logprob)
loc_test_val = 4.0
scale_test_val = np.full(rv_size, 0.5)
@@ -193,15 +199,15 @@ def test_affine_transform_rv():
def test_affine_log_transform_rv():
- a, b = at.scalars("a", "b")
- base_rv = at.random.lognormal(0, 1, name="base_rv", size=(1, 2))
- y_rv = a + at.log(base_rv) * b
+ a, b = pt.scalars("a", "b")
+ base_rv = pt.random.lognormal(0, 1, name="base_rv", size=(1, 2))
+ y_rv = a + pt.log(base_rv) * b
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- logp_fn = pytensor.function([a, b, y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ logp_fn = pytensor.function([a, b, y_vv], logprob)
a_val = -1.5
b_val = 3.0
diff --git a/tests/logprob/test_cumsum.py b/tests/logprob/test_cumsum.py
index 6a25f6d983..94ea39a5fd 100644
--- a/tests/logprob/test_cumsum.py
+++ b/tests/logprob/test_cumsum.py
@@ -36,12 +36,13 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats as st
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc import logp
+from pymc.logprob.basic import factorized_joint_logprob
+from pymc.testing import assert_no_rvs
@pytest.mark.parametrize(
@@ -57,14 +58,14 @@
],
)
def test_normal_cumsum(size, axis):
- rv = at.random.normal(0, 1, size=size).cumsum(axis)
+ rv = pt.random.normal(0, 1, size=size).cumsum(axis)
vv = rv.clone()
- logp = joint_logprob({rv: vv})
- assert_no_rvs(logp)
+ logprob = logp(rv, vv)
+ assert_no_rvs(logprob)
assert np.isclose(
st.norm(0, 1).logpdf(np.ones(size)).sum(),
- logp.eval({vv: np.ones(size).cumsum(axis)}),
+ logprob.eval({vv: np.ones(size).cumsum(axis)}).sum(),
)
@@ -81,38 +82,40 @@ def test_normal_cumsum(size, axis):
],
)
def test_bernoulli_cumsum(size, axis):
- rv = at.random.bernoulli(0.9, size=size).cumsum(axis)
+ rv = pt.random.bernoulli(0.9, size=size).cumsum(axis)
vv = rv.clone()
- logp = joint_logprob({rv: vv})
- assert_no_rvs(logp)
+ logprob = logp(rv, vv)
+ assert_no_rvs(logprob)
assert np.isclose(
st.bernoulli(0.9).logpmf(np.ones(size)).sum(),
- logp.eval({vv: np.ones(size, int).cumsum(axis)}),
+ logprob.eval({vv: np.ones(size, int).cumsum(axis)}).sum(),
)
def test_destructive_cumsum_fails():
"""Test that a cumsum that mixes dimensions fails"""
- x_rv = at.random.normal(size=(2, 2, 2)).cumsum()
+ x_rv = pt.random.normal(size=(2, 2, 2)).cumsum()
x_vv = x_rv.clone()
with pytest.raises(RuntimeError, match="could not be derived"):
- joint_logprob({x_rv: x_vv})
+ factorized_joint_logprob({x_rv: x_vv})
def test_deterministic_cumsum():
"""Test that deterministic cumsum is not affected"""
- x_rv = at.random.normal(1, 1, size=5)
- cumsum_x_rv = at.cumsum(x_rv)
- y_rv = at.random.normal(cumsum_x_rv, 1)
+ x_rv = pt.random.normal(1, 1, size=5)
+ cumsum_x_rv = pt.cumsum(x_rv)
+ y_rv = pt.random.normal(cumsum_x_rv, 1)
x_vv = x_rv.clone()
y_vv = y_rv.clone()
- logp = joint_logprob({x_rv: x_vv, y_rv: y_vv})
- assert_no_rvs(logp)
- logp_fn = pytensor.function([x_vv, y_vv], logp)
+ logp = factorized_joint_logprob({x_rv: x_vv, y_rv: y_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+ assert_no_rvs(logp_combined)
+
+ logp_fn = pytensor.function([x_vv, y_vv], logp_combined)
assert np.isclose(
- logp_fn(np.ones(5), np.arange(5) + 1),
+ logp_fn(np.ones(5), np.arange(5) + 1).sum(),
st.norm(1, 1).logpdf(1) * 10,
)
diff --git a/tests/logprob/test_mixture.py b/tests/logprob/test_mixture.py
index bad48d5d11..465ecabe79 100644
--- a/tests/logprob/test_mixture.py
+++ b/tests/logprob/test_mixture.py
@@ -36,31 +36,38 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats.distributions as sp
+from pytensor import function
from pytensor.graph.basic import Variable, equal_computations
+from pytensor.ifelse import ifelse
from pytensor.tensor.random.basic import CategoricalRV
from pytensor.tensor.shape import shape_tuple
-from pytensor.tensor.subtensor import as_index_constant
+from pytensor.tensor.subtensor import (
+ AdvancedSubtensor,
+ AdvancedSubtensor1,
+ Subtensor,
+ as_index_constant,
+)
-from pymc.logprob.joint_logprob import factorized_joint_logprob
+from pymc.logprob.basic import factorized_joint_logprob
from pymc.logprob.mixture import MixtureRV, expand_indices
from pymc.logprob.rewriting import construct_ir_fgraph
from pymc.logprob.utils import dirac_delta
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob, scipy_logprob
+from pymc.testing import assert_no_rvs
+from tests.logprob.utils import scipy_logprob
def test_mixture_basics():
- srng = at.random.RandomStream(29833)
+ srng = pt.random.RandomStream(29833)
def create_mix_model(size, axis):
X_rv = srng.normal(0, 1, size=size, name="X")
Y_rv = srng.gamma(0.5, 0.5, size=size, name="Y")
- p_at = at.scalar("p")
+ p_at = pt.scalar("p")
p_at.tag.test_value = 0.5
I_rv = srng.bernoulli(p_at, size=size, name="I")
@@ -68,9 +75,9 @@ def create_mix_model(size, axis):
i_vv.name = "i"
if isinstance(axis, Variable):
- M_rv = at.join(axis, X_rv, Y_rv)[I_rv]
+ M_rv = pt.join(axis, X_rv, Y_rv)[I_rv]
else:
- M_rv = at.stack([X_rv, Y_rv], axis=axis)[I_rv]
+ M_rv = pt.stack([X_rv, Y_rv], axis=axis)[I_rv]
M_rv.name = "M"
m_vv = M_rv.clone()
@@ -92,31 +99,31 @@ def create_mix_model(size, axis):
factorized_joint_logprob({M_rv: m_vv, I_rv: i_vv, X_rv: x_vv})
with pytest.raises(RuntimeError, match="could not be derived: {m}"):
- axis_at = at.lscalar("axis")
+ axis_at = pt.lscalar("axis")
axis_at.tag.test_value = 0
env = create_mix_model((2,), axis_at)
I_rv = env["I_rv"]
i_vv = env["i_vv"]
M_rv = env["M_rv"]
m_vv = env["m_vv"]
- joint_logprob({M_rv: m_vv, I_rv: i_vv})
+ factorized_joint_logprob({M_rv: m_vv, I_rv: i_vv})
@pytensor.config.change_flags(compute_test_value="warn")
@pytest.mark.parametrize(
"op_constructor",
[
- lambda _I, _X, _Y: at.stack([_X, _Y])[_I],
- lambda _I, _X, _Y: at.switch(_I, _X, _Y),
+ lambda _I, _X, _Y: pt.stack([_X, _Y])[_I],
+ lambda _I, _X, _Y: pt.switch(_I, _X, _Y),
],
)
def test_compute_test_value(op_constructor):
- srng = at.random.RandomStream(29833)
+ srng = pt.random.RandomStream(29833)
X_rv = srng.normal(0, 1, name="X")
Y_rv = srng.gamma(0.5, 0.5, name="Y")
- p_at = at.scalar("p")
+ p_at = pt.scalar("p")
p_at.tag.test_value = 0.3
I_rv = srng.bernoulli(p_at, name="I")
@@ -132,9 +139,10 @@ def test_compute_test_value(op_constructor):
del M_rv.tag.test_value
- M_logp = joint_logprob({M_rv: m_vv, I_rv: i_vv}, sum=False)
+ M_logp = factorized_joint_logprob({M_rv: m_vv, I_rv: i_vv})
+ M_logp_combined = pt.add(*M_logp.values())
- assert isinstance(M_logp.tag.test_value, np.ndarray)
+ assert isinstance(M_logp_combined.tag.test_value, np.ndarray)
@pytest.mark.parametrize(
@@ -151,18 +159,18 @@ def test_compute_test_value(op_constructor):
],
)
def test_hetero_mixture_binomial(p_val, size, supported):
- srng = at.random.RandomStream(29833)
+ srng = pt.random.RandomStream(29833)
X_rv = srng.normal(0, 1, size=size, name="X")
Y_rv = srng.gamma(0.5, 0.5, size=size, name="Y")
if np.ndim(p_val) == 0:
- p_at = at.scalar("p")
+ p_at = pt.scalar("p")
p_at.tag.test_value = p_val
I_rv = srng.bernoulli(p_at, size=size, name="I")
p_val_1 = p_val
else:
- p_at = at.vector("p")
+ p_at = pt.vector("p")
p_at.tag.test_value = np.array(p_val, dtype=pytensor.config.floatX)
I_rv = srng.categorical(p_at, size=size, name="I")
p_val_1 = p_val[1]
@@ -170,20 +178,21 @@ def test_hetero_mixture_binomial(p_val, size, supported):
i_vv = I_rv.clone()
i_vv.name = "i"
- M_rv = at.stack([X_rv, Y_rv])[I_rv]
+ M_rv = pt.stack([X_rv, Y_rv])[I_rv]
M_rv.name = "M"
m_vv = M_rv.clone()
m_vv.name = "m"
if supported:
- M_logp = joint_logprob({M_rv: m_vv, I_rv: i_vv}, sum=False)
+ M_logp = factorized_joint_logprob({M_rv: m_vv, I_rv: i_vv})
+ M_logp_combined = pt.add(*M_logp.values())
else:
with pytest.raises(RuntimeError, match="could not be derived: {m}"):
- joint_logprob({M_rv: m_vv, I_rv: i_vv}, sum=False)
+ factorized_joint_logprob({M_rv: m_vv, I_rv: i_vv})
return
- M_logp_fn = pytensor.function([p_at, m_vv, i_vv], M_logp)
+ M_logp_fn = pytensor.function([p_at, m_vv, i_vv], M_logp_combined)
assert_no_rvs(M_logp_fn.maker.fgraph.outputs[0])
@@ -560,13 +569,13 @@ def test_hetero_mixture_binomial(p_val, size, supported):
def test_hetero_mixture_categorical(
X_args, Y_args, Z_args, p_val, comp_size, idx_size, extra_indices, join_axis, supported
):
- srng = at.random.RandomStream(29833)
+ srng = pt.random.RandomStream(29833)
X_rv = srng.normal(*X_args, size=comp_size, name="X")
Y_rv = srng.gamma(*Y_args, size=comp_size, name="Y")
Z_rv = srng.normal(*Z_args, size=comp_size, name="Z")
- p_at = at.as_tensor(p_val).type()
+ p_at = pt.as_tensor(p_val).type()
p_at.name = "p"
p_at.tag.test_value = np.array(p_val, dtype=pytensor.config.floatX)
I_rv = srng.categorical(p_at, size=idx_size, name="I")
@@ -578,7 +587,7 @@ def test_hetero_mixture_categorical(
indices_at.insert(join_axis, I_rv)
indices_at = tuple(indices_at)
- M_rv = at.stack([X_rv, Y_rv, Z_rv], axis=join_axis)[indices_at]
+ M_rv = pt.stack([X_rv, Y_rv, Z_rv], axis=join_axis)[indices_at]
M_rv.name = "M"
m_vv = M_rv.clone()
@@ -727,7 +736,7 @@ def test_hetero_mixture_categorical(
],
)
def test_expand_indices_basic(A_parts, indices):
- A = at.stack(A_parts)
+ A = pt.stack(A_parts)
at_indices = [as_index_constant(idx) for idx in indices]
full_indices = expand_indices(at_indices, shape_tuple(A))
assert len(full_indices) == A.ndim
@@ -764,7 +773,7 @@ def test_expand_indices_basic(A_parts, indices):
],
)
def test_expand_indices_moved_subspaces(A_parts, indices):
- A = at.stack(A_parts)
+ A = pt.stack(A_parts)
at_indices = [as_index_constant(idx) for idx in indices]
full_indices = expand_indices(at_indices, shape_tuple(A))
assert len(full_indices) == A.ndim
@@ -811,7 +820,7 @@ def test_expand_indices_moved_subspaces(A_parts, indices):
],
)
def test_expand_indices_single_indices(A_parts, indices):
- A = at.stack(A_parts)
+ A = pt.stack(A_parts)
at_indices = [as_index_constant(idx) for idx in indices]
full_indices = expand_indices(at_indices, shape_tuple(A))
assert len(full_indices) == A.ndim
@@ -866,7 +875,7 @@ def test_expand_indices_single_indices(A_parts, indices):
],
)
def test_expand_indices_newaxis(A_parts, indices):
- A = at.stack(A_parts)
+ A = pt.stack(A_parts)
at_indices = [as_index_constant(idx) for idx in indices]
full_indices = expand_indices(at_indices, shape_tuple(A))
assert len(full_indices) == A.ndim
@@ -876,7 +885,7 @@ def test_expand_indices_newaxis(A_parts, indices):
def test_mixture_with_DiracDelta():
- srng = at.random.RandomStream(29833)
+ srng = pt.random.RandomStream(29833)
X_rv = srng.normal(0, 1, name="X")
Y_rv = dirac_delta(0.0)
@@ -887,7 +896,7 @@ def test_mixture_with_DiracDelta():
i_vv = I_rv.clone()
i_vv.name = "i"
- M_rv = at.stack([X_rv, Y_rv])[I_rv]
+ M_rv = pt.stack([X_rv, Y_rv])[I_rv]
M_rv.name = "M"
m_vv = M_rv.clone()
@@ -899,7 +908,7 @@ def test_mixture_with_DiracDelta():
def test_switch_mixture():
- srng = at.random.RandomStream(29833)
+ srng = pt.random.RandomStream(29833)
X_rv = srng.normal(-10.0, 0.1, name="X")
Y_rv = srng.normal(10.0, 0.1, name="Y")
@@ -908,7 +917,7 @@ def test_switch_mixture():
i_vv = I_rv.clone()
i_vv.name = "i"
- Z1_rv = at.switch(I_rv, X_rv, Y_rv)
+ Z1_rv = pt.switch(I_rv, X_rv, Y_rv)
z_vv = Z1_rv.clone()
z_vv.name = "z1"
@@ -928,17 +937,180 @@ def test_switch_mixture():
# building the identical graph but with a stack to check that mixture computations are identical
- Z2_rv = at.stack((X_rv, Y_rv))[I_rv]
+ Z2_rv = pt.stack((X_rv, Y_rv))[I_rv]
fgraph2, _, _ = construct_ir_fgraph({Z2_rv: z_vv, I_rv: i_vv})
assert equal_computations(fgraph.outputs, fgraph2.outputs)
- z1_logp = joint_logprob({Z1_rv: z_vv, I_rv: i_vv})
- z2_logp = joint_logprob({Z2_rv: z_vv, I_rv: i_vv})
+ z1_logp = factorized_joint_logprob({Z1_rv: z_vv, I_rv: i_vv})
+ z2_logp = factorized_joint_logprob({Z2_rv: z_vv, I_rv: i_vv})
+ z1_logp_combined = pt.sum([pt.sum(factor) for factor in z1_logp.values()])
+ z2_logp_combined = pt.sum([pt.sum(factor) for factor in z2_logp.values()])
# below should follow immediately from the equal_computations assertion above
-    assert equal_computations([z1_logp], [z2_logp])
-    np.testing.assert_almost_equal(0.69049938, z1_logp.eval({z_vv: -10, i_vv: 0}))
-    np.testing.assert_almost_equal(0.69049938, z2_logp.eval({z_vv: -10, i_vv: 0}))
+    assert equal_computations([z1_logp_combined], [z2_logp_combined])
+
+    np.testing.assert_almost_equal(0.69049938, z1_logp_combined.eval({z_vv: -10, i_vv: 0}))
+    np.testing.assert_almost_equal(0.69049938, z2_logp_combined.eval({z_vv: -10, i_vv: 0}))
+
+
+def test_ifelse_mixture_one_component():
+ if_rv = pt.random.bernoulli(0.5, name="if")
+ scale_rv = pt.random.halfnormal(name="scale")
+ comp_then = pt.random.normal(0, scale_rv, size=(2,), name="comp_then")
+ comp_else = pt.random.halfnormal(0, scale_rv, size=(4,), name="comp_else")
+ mix_rv = ifelse(if_rv, comp_then, comp_else, name="mix")
+
+ if_vv = if_rv.clone()
+ scale_vv = scale_rv.clone()
+ mix_vv = mix_rv.clone()
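+    # Only the mixture's own logp factor is needed here, so index the returned dict by its value variable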
+ mix_logp = factorized_joint_logprob({if_rv: if_vv, scale_rv: scale_vv, mix_rv: mix_vv})[mix_vv]
+ assert_no_rvs(mix_logp)
+
+ fn = function([if_vv, scale_vv, mix_vv], mix_logp)
+ scale_vv_test = 0.75
+ mix_vv_test = np.r_[1.0, 2.5]
+ np.testing.assert_array_almost_equal(
+ fn(1, scale_vv_test, mix_vv_test),
+ sp.norm(0, scale_vv_test).logpdf(mix_vv_test),
+ )
+ mix_vv_test = np.r_[1.0, 2.5, 3.5, 4.0]
+ np.testing.assert_array_almost_equal(
+ fn(0, scale_vv_test, mix_vv_test), sp.halfnorm(0, scale_vv_test).logpdf(mix_vv_test)
+ )
+
+
+def test_ifelse_mixture_multiple_components():
+ rng = np.random.default_rng(968)
+
+ if_var = pt.scalar("if_var", dtype="bool")
+ comp_then1 = pt.random.normal(size=(2,), name="comp_true1")
+ comp_then2 = pt.random.normal(comp_then1, size=(2, 2), name="comp_then2")
+ comp_else1 = pt.random.halfnormal(size=(4,), name="comp_else1")
+ comp_else2 = pt.random.halfnormal(size=(4, 4), name="comp_else2")
+
+ mix_rv1, mix_rv2 = ifelse(
+ if_var, [comp_then1, comp_then2], [comp_else1, comp_else2], name="mix"
+ )
+ mix_vv1 = mix_rv1.clone()
+ mix_vv2 = mix_rv2.clone()
+ mix_logp1, mix_logp2 = factorized_joint_logprob({mix_rv1: mix_vv1, mix_rv2: mix_vv2}).values()
+ assert_no_rvs(mix_logp1)
+ assert_no_rvs(mix_logp2)
+
+ fn = function([if_var, mix_vv1, mix_vv2], mix_logp1.sum() + mix_logp2.sum())
+ mix_vv1_test = np.abs(rng.normal(size=(2,)))
+ mix_vv2_test = np.abs(rng.normal(size=(2, 2)))
+ np.testing.assert_almost_equal(
+ fn(True, mix_vv1_test, mix_vv2_test),
+ sp.norm(0, 1).logpdf(mix_vv1_test).sum()
+ + sp.norm(mix_vv1_test, 1).logpdf(mix_vv2_test).sum(),
+ )
+ mix_vv1_test = np.abs(rng.normal(size=(4,)))
+ mix_vv2_test = np.abs(rng.normal(size=(4, 4)))
+ np.testing.assert_almost_equal(
+ fn(False, mix_vv1_test, mix_vv2_test),
+ sp.halfnorm(0, 1).logpdf(mix_vv1_test).sum() + sp.halfnorm(0, 1).logpdf(mix_vv2_test).sum(),
+ )
+
+
+def test_ifelse_mixture_shared_component():
+ rng = np.random.default_rng(1009)
+
+ if_var = pt.scalar("if_var", dtype="bool")
+ outer_rv = pt.random.normal(name="outer")
+ # comp_shared need not be an output of ifelse at all,
+ # but since we allow arbitrary graphs we test it works as expected.
+ comp_shared = pt.random.normal(size=(2,), name="comp_shared")
+ comp_then = outer_rv + pt.random.normal(comp_shared, 1, size=(4, 2), name="comp_then")
+ comp_else = outer_rv + pt.random.normal(comp_shared, 10, size=(8, 2), name="comp_else")
+ shared_rv, mix_rv = ifelse(
+ if_var, [comp_shared, comp_then], [comp_shared, comp_else], name="mix"
+ )
+
+ outer_vv = outer_rv.clone()
+ shared_vv = shared_rv.clone()
+ mix_vv = mix_rv.clone()
+ outer_logp, mix_logp1, mix_logp2 = factorized_joint_logprob(
+ {outer_rv: outer_vv, shared_rv: shared_vv, mix_rv: mix_vv}
+ ).values()
+ assert_no_rvs(outer_logp)
+ assert_no_rvs(mix_logp1)
+ assert_no_rvs(mix_logp2)
+
+ fn = function([if_var, outer_vv, shared_vv, mix_vv], mix_logp1.sum() + mix_logp2.sum())
+ outer_vv_test = rng.normal()
+ shared_vv_test = rng.normal(size=(2,))
+ mix_vv_test = rng.normal(size=(4, 2))
+ np.testing.assert_almost_equal(
+ fn(True, outer_vv_test, shared_vv_test, mix_vv_test),
+ (
+ sp.norm(0, 1).logpdf(shared_vv_test).sum()
+ + sp.norm(outer_vv_test + shared_vv_test, 1).logpdf(mix_vv_test).sum()
+ ),
+ )
+ mix_vv_test = rng.normal(size=(8, 2))
+ np.testing.assert_almost_equal(
+ fn(False, outer_vv_test, shared_vv_test, mix_vv_test),
+ (
+ sp.norm(0, 1).logpdf(shared_vv_test).sum()
+ + sp.norm(outer_vv_test + shared_vv_test, 10).logpdf(mix_vv_test).sum()
+ ),
+ decimal=6,
+ )
+
+
+def test_joint_logprob_subtensor():
+ """Make sure we can compute a joint log-probability for ``Y[I]`` where ``Y`` and ``I`` are random variables."""
+
+ size = 5
+
+ mu_base = np.power(10, np.arange(np.prod(size))).reshape(size)
+ mu = np.stack([mu_base, -mu_base])
+ sigma = 0.001
+ rng = pytensor.shared(np.random.RandomState(232), borrow=True)
+
+ A_rv = pt.random.normal(mu, sigma, rng=rng)
+ A_rv.name = "A"
+
+ p = 0.5
+
+ I_rv = pt.random.bernoulli(p, size=size, rng=rng)
+ I_rv.name = "I"
+
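+    # For each of the `size` positions, the Bernoulli draw in `I_rv` selects which row of `A_rv` is indexed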
+ A_idx = A_rv[I_rv, pt.ogrid[A_rv.shape[-1] :]]
+
+ assert isinstance(A_idx.owner.op, (Subtensor, AdvancedSubtensor, AdvancedSubtensor1))
+
+ A_idx_value_var = A_idx.type()
+ A_idx_value_var.name = "A_idx_value"
+
+ I_value_var = I_rv.type()
+ I_value_var.name = "I_value"
+
+ A_idx_logp = factorized_joint_logprob({A_idx: A_idx_value_var, I_rv: I_value_var})
+ A_idx_logp_comb = pt.add(*A_idx_logp.values())
+
+ logp_vals_fn = pytensor.function([A_idx_value_var, I_value_var], A_idx_logp_comb)
+
+ # The compiled graph should not contain any `RandomVariables`
+ assert_no_rvs(logp_vals_fn.maker.fgraph.outputs[0])
+
+ decimals = 6 if pytensor.config.floatX == "float64" else 4
+
+ test_val_rng = np.random.RandomState(3238)
+
+ for i in range(10):
+ bern_sp = sp.bernoulli(p)
+ I_value = bern_sp.rvs(size=size, random_state=test_val_rng).astype(I_rv.dtype)
+
+ norm_sp = sp.norm(mu[I_value, np.ogrid[mu.shape[1] :]], sigma)
+ A_idx_value = norm_sp.rvs(random_state=test_val_rng).astype(A_idx.dtype)
+
+ exp_obs_logps = norm_sp.logpdf(A_idx_value)
+ exp_obs_logps += bern_sp.logpmf(I_value)
+
+ logp_vals = logp_vals_fn(A_idx_value, I_value)
+
+ np.testing.assert_almost_equal(logp_vals, exp_obs_logps, decimal=decimals)
diff --git a/tests/logprob/test_rewriting.py b/tests/logprob/test_rewriting.py
index a3ec1e6c4e..b8836bbce5 100644
--- a/tests/logprob/test_rewriting.py
+++ b/tests/logprob/test_rewriting.py
@@ -36,7 +36,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats.distributions as sp
@@ -50,16 +50,16 @@
Subtensor,
)
+from pymc.logprob.basic import factorized_joint_logprob
from pymc.logprob.rewriting import local_lift_DiracDelta
from pymc.logprob.utils import DiracDelta, dirac_delta
-from tests.logprob.utils import joint_logprob
def test_local_lift_DiracDelta():
- c_at = at.vector()
+ c_at = pt.vector()
dd_at = dirac_delta(c_at)
- Z_at = at.cast(dd_at, "int64")
+ Z_at = pt.cast(dd_at, "int64")
res = rewrite_graph(Z_at, custom_rewrite=in2out(local_lift_DiracDelta), clone=False)
assert isinstance(res.owner.op, DiracDelta)
@@ -78,16 +78,16 @@ def test_local_lift_DiracDelta():
assert isinstance(res.owner.inputs[0].owner.op, Subtensor)
# Don't lift multi-output `Op`s
- c_at = at.matrix()
+ c_at = pt.matrix()
dd_at = dirac_delta(c_at)
- Z_at = at.nlinalg.svd(dd_at)[0]
+ Z_at = pt.nlinalg.svd(dd_at)[0]
res = rewrite_graph(Z_at, custom_rewrite=in2out(local_lift_DiracDelta), clone=False)
assert res is Z_at
def test_local_remove_DiracDelta():
- c_at = at.vector()
+ c_at = pt.vector()
dd_at = dirac_delta(c_at)
fn = pytensor.function([c_at], dd_at)
@@ -112,17 +112,18 @@ def test_joint_logprob_incsubtensor(indices, size):
data = rng.normal(mu[indices], 1.0)
y_val = rng.normal(mu, sigma, size=size)
- Y_base_rv = at.random.normal(mu, sigma, size=size)
- Y_rv = at.set_subtensor(Y_base_rv[indices], data)
+ Y_base_rv = pt.random.normal(mu, sigma, size=size)
+ Y_rv = pt.set_subtensor(Y_base_rv[indices], data)
Y_rv.name = "Y"
y_value_var = Y_rv.clone()
y_value_var.name = "y"
assert isinstance(Y_rv.owner.op, (IncSubtensor, AdvancedIncSubtensor, AdvancedIncSubtensor1))
- Y_rv_logp = joint_logprob({Y_rv: y_value_var}, sum=False)
+ Y_rv_logp = factorized_joint_logprob({Y_rv: y_value_var})
+ Y_rv_logp_combined = pt.add(*Y_rv_logp.values())
- obs_logps = Y_rv_logp.eval({y_value_var: y_val})
+ obs_logps = Y_rv_logp_combined.eval({y_value_var: y_val})
y_val_idx = y_val.copy()
y_val_idx[indices] = data
diff --git a/tests/logprob/test_scan.py b/tests/logprob/test_scan.py
index 0b4577b1a9..748a4405fc 100644
--- a/tests/logprob/test_scan.py
+++ b/tests/logprob/test_scan.py
@@ -36,22 +36,22 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor import Mode
from pytensor.raise_op import assert_op
from pytensor.scan.utils import ScanArgs
+from scipy import stats
-from pymc.logprob.abstract import logprob
-from pymc.logprob.joint_logprob import factorized_joint_logprob
+from pymc.logprob.abstract import _logprob_helper
+from pymc.logprob.basic import factorized_joint_logprob, logp
from pymc.logprob.scan import (
construct_scan,
convert_outer_out_to_in,
get_random_outer_outputs,
)
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc.testing import assert_no_rvs
def create_inner_out_logp(value_map):
@@ -62,7 +62,7 @@ def create_inner_out_logp(value_map):
"""
res = []
for old_inner_out_var, new_inner_in_var in value_map.items():
- logp = logprob(old_inner_out_var, new_inner_in_var)
+ logp = _logprob_helper(old_inner_out_var, new_inner_in_var)
if new_inner_in_var.name:
logp.name = f"logp({new_inner_in_var.name})"
res.append(logp)
@@ -92,17 +92,17 @@ def input_step_fn(mu_tm1, y_tm1, rng):
y_tm1.name = "y_tm1"
mu = mu_tm1 + y_tm1 + 1
mu.name = "mu_t"
- return mu, at.random.normal(mu, 1.0, rng=rng, name="Y_t")
+ return mu, pt.random.normal(mu, 1.0, rng=rng, name="Y_t")
(mu_tt, Y_rv), _ = pytensor.scan(
fn=input_step_fn,
outputs_info=[
{
- "initial": at.as_tensor_variable(0.0, dtype=pytensor.config.floatX),
+ "initial": pt.as_tensor_variable(0.0, dtype=pytensor.config.floatX),
"taps": [-1],
},
{
- "initial": at.as_tensor_variable(0.0, dtype=pytensor.config.floatX),
+ "initial": pt.as_tensor_variable(0.0, dtype=pytensor.config.floatX),
"taps": [-1],
},
],
@@ -125,7 +125,7 @@ def input_step_fn(mu_tm1, y_tm1, rng):
# Sample from the model and create another `Scan` that computes the
# log-likelihood of the model at the sampled point.
#
- Y_obs = at.as_tensor_variable(Y_rv.eval())
+ Y_obs = pt.as_tensor_variable(Y_rv.eval())
Y_obs.name = "Y_obs"
def output_step_fn(y_t, y_tm1, mu_tm1):
@@ -133,7 +133,7 @@ def output_step_fn(y_t, y_tm1, mu_tm1):
y_tm1.name = "y_tm1"
mu = mu_tm1 + y_tm1 + 1
mu.name = "mu_t"
- logp = logprob(at.random.normal(mu, 1.0), y_t)
+ logp = _logprob_helper(pt.random.normal(mu, 1.0), y_t)
logp.name = "logp"
return mu, logp
@@ -142,7 +142,7 @@ def output_step_fn(y_t, y_tm1, mu_tm1):
sequences=[{"input": Y_obs, "taps": [0, -1]}],
outputs_info=[
{
- "initial": at.as_tensor_variable(0.0, dtype=pytensor.config.floatX),
+ "initial": pt.as_tensor_variable(0.0, dtype=pytensor.config.floatX),
"taps": [-1],
},
{},
@@ -202,12 +202,12 @@ def test_convert_outer_out_to_in_mit_sot():
def input_step_fn(y_tm1, y_tm2, rng):
y_tm1.name = "y_tm1"
y_tm2.name = "y_tm2"
- return at.random.normal(y_tm1 + y_tm2, 1.0, rng=rng, name="Y_t")
+ return pt.random.normal(y_tm1 + y_tm2, 1.0, rng=rng, name="Y_t")
Y_rv, _ = pytensor.scan(
fn=input_step_fn,
outputs_info=[
- {"initial": at.as_tensor_variable(np.r_[-1.0, 0.0]), "taps": [-1, -2]},
+ {"initial": pt.as_tensor_variable(np.r_[-1.0, 0.0]), "taps": [-1, -2]},
],
non_sequences=[rng_tt],
n_steps=10,
@@ -217,7 +217,7 @@ def input_step_fn(y_tm1, y_tm2, rng):
Y_all = Y_rv.owner.inputs[0]
Y_all.name = "Y_all"
- Y_obs = at.as_tensor_variable(Y_rv.eval())
+ Y_obs = pt.as_tensor_variable(Y_rv.eval())
Y_obs.name = "Y_obs"
input_scan_args = ScanArgs.from_node(Y_rv.owner.inputs[0].owner)
@@ -232,7 +232,7 @@ def output_step_fn(y_t, y_tm1, y_tm2):
y_t.name = "y_t"
y_tm1.name = "y_tm1"
y_tm2.name = "y_tm2"
- logp = logprob(at.random.normal(y_tm1 + y_tm2, 1.0), y_t)
+ logp = _logprob_helper(pt.random.normal(y_tm1 + y_tm2, 1.0), y_t)
logp.name = "logp(y_t)"
return logp
@@ -282,25 +282,25 @@ def output_step_fn(y_t, y_tm1, y_tm2):
],
)
def test_scan_joint_logprob(require_inner_rewrites):
- srng = at.random.RandomStream()
+ srng = pt.random.RandomStream()
- N_tt = at.iscalar("N")
+ N_tt = pt.iscalar("N")
N_val = 10
N_tt.tag.test_value = N_val
- M_tt = at.iscalar("M")
+ M_tt = pt.iscalar("M")
M_val = 2
M_tt.tag.test_value = M_val
- mus_tt = at.matrix("mus_t")
+ mus_tt = pt.matrix("mus_t")
mus_val = np.stack([np.arange(0.0, 10), np.arange(0.0, -10, -1)], axis=-1).astype(
pytensor.config.floatX
)
mus_tt.tag.test_value = mus_val
- sigmas_tt = at.ones((N_tt,))
- Gamma_rv = srng.dirichlet(at.ones((M_tt, M_tt)), name="Gamma")
+ sigmas_tt = pt.ones((N_tt,))
+ Gamma_rv = srng.dirichlet(pt.ones((M_tt, M_tt)), name="Gamma")
Gamma_vv = Gamma_rv.clone()
Gamma_vv.name = "Gamma_vv"
@@ -335,7 +335,8 @@ def scan_fn(mus_t, sigma_t, Gamma_t):
s_vv = S_rv.clone()
s_vv.name = "s"
- y_logp = joint_logprob({Y_rv: y_vv, S_rv: s_vv, Gamma_rv: Gamma_vv})
+ y_logp = factorized_joint_logprob({Y_rv: y_vv, S_rv: s_vv, Gamma_rv: Gamma_vv})
+ y_logp_combined = pt.sum([pt.sum(factor) for factor in y_logp.values()])
y_val = np.arange(10)
s_val = np.array([0, 1, 0, 1, 1, 0, 0, 0, 1, 1])
@@ -349,16 +350,16 @@ def scan_fn(mus_t, sigma_t, Gamma_t):
Gamma_vv: Gamma_val,
}
- y_logp_fn = pytensor.function(list(test_point.keys()), y_logp)
+ y_logp_fn = pytensor.function(list(test_point.keys()), y_logp_combined)
assert_no_rvs(y_logp_fn.maker.fgraph.outputs[0])
# Construct the joint log-probability by hand so we can compare it with
# `y_logp`
def scan_fn(mus_t, sigma_t, Y_t_val, S_t_val, Gamma_t):
- S_t = at.random.categorical(Gamma_t[0], name="S_t")
- Y_t = at.random.normal(mus_t[S_t_val], sigma_t, name="Y_t")
- Y_t_logp, S_t_logp = logprob(Y_t, Y_t_val), logprob(S_t, S_t_val)
+ S_t = pt.random.categorical(Gamma_t[0], name="S_t")
+ Y_t = pt.random.normal(mus_t[S_t_val], sigma_t, name="Y_t")
+ Y_t_logp, S_t_logp = _logprob_helper(Y_t, Y_t_val), _logprob_helper(S_t, S_t_val)
Y_t_logp.name = "log(Y_t=y_t)"
S_t_logp.name = "log(S_t=s_t)"
return Y_t_logp, S_t_logp
@@ -374,13 +375,13 @@ def scan_fn(mus_t, sigma_t, Y_t_val, S_t_val, Gamma_t):
Y_rv_logp.name = "logp(Y=y)"
S_rv_logp.name = "logp(S=s)"
- Gamma_logp = logprob(Gamma_rv, Gamma_vv)
+ Gamma_logp = _logprob_helper(Gamma_rv, Gamma_vv)
y_logp_ref = Y_rv_logp.sum() + S_rv_logp.sum() + Gamma_logp.sum()
assert_no_rvs(y_logp_ref)
- y_logp_val = y_logp.eval(test_point)
+ y_logp_val = y_logp_combined.eval(test_point)
y_logp_ref_val = y_logp_ref.eval(test_point)
@@ -391,13 +392,13 @@ def scan_fn(mus_t, sigma_t, Y_t_val, S_t_val, Gamma_t):
@pytensor.config.change_flags(compute_test_value="raise")
@pytest.mark.xfail(reason="see #148")
def test_initial_values():
- srng = at.random.RandomStream(seed=2320)
+ srng = pt.random.RandomStream(seed=2320)
p_S_0 = np.array([0.9, 0.1])
S_0_rv = srng.categorical(p_S_0, name="S_0")
S_0_rv.tag.test_value = 0
- Gamma_at = at.matrix("Gamma")
+ Gamma_at = pt.matrix("Gamma")
Gamma_at.tag.test_value = np.array([[0, 1], [1, 0]])
s_0_vv = S_0_rv.clone()
@@ -443,14 +444,14 @@ def step_fn(S_tm1, Gamma):
def test_mode_is_kept(remove_asserts):
mode = Mode().including("local_remove_all_assert") if remove_asserts else None
x, _ = pytensor.scan(
- fn=lambda x: at.random.normal(assert_op(x, x > 0)),
- outputs_info=[at.ones(())],
+ fn=lambda x: pt.random.normal(assert_op(x, x > 0)),
+ outputs_info=[pt.ones(())],
n_steps=10,
mode=mode,
)
x.name = "x"
x_vv = x.clone()
- x_logp = pytensor.function([x_vv], joint_logprob({x: x_vv}))
+ x_logp = pytensor.function([x_vv], pt.sum(logp(x, x_vv)))
x_test_val = np.full((10,), -1)
if remove_asserts:
@@ -458,3 +459,98 @@ def test_mode_is_kept(remove_asserts):
else:
with pytest.raises(AssertionError):
x_logp(x=x_test_val)
+
+
+def test_scan_non_pure_rv_output():
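+    # The scan step returns `normal() + xtm1`, a deterministic function of an RV rather
+    # than a pure RandomVariable output, which is the case this test exercises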
+ grw, _ = pytensor.scan(
+ fn=lambda xtm1: pt.random.normal() + xtm1,
+ outputs_info=[pt.zeros(())],
+ n_steps=10,
+ name="grw",
+ )
+
+ grw_vv = grw.clone()
+ grw_logp = logp(grw, grw_vv)
+ assert_no_rvs(grw_logp)
+
+ grw_vv_test = np.arange(10) + 1
+ np.testing.assert_array_almost_equal(
+ grw_logp.eval({grw_vv: grw_vv_test}),
+ stats.norm.logpdf(np.ones(10)),
+ )
+
+
+def test_scan_over_seqs():
+ """Test that logprob inference for scans based on sequences (mapping)."""
+ rng = np.random.default_rng(543)
+ n_steps = 10
+
+ xs = pt.random.normal(size=(n_steps,), name="xs")
+ ys, _ = pytensor.scan(
+ fn=lambda x: pt.random.normal(x), sequences=[xs], outputs_info=[None], name="ys"
+ )
+
+    xs_vv = xs.clone()
+ ys_vv = ys.clone()
+ ys_logp = factorized_joint_logprob({xs: xs_vv, ys: ys_vv})[ys_vv]
+
+ assert_no_rvs(ys_logp)
+
+ xs_test = rng.normal(size=(10,))
+ ys_test = rng.normal(size=(10,))
+ np.testing.assert_array_almost_equal(
+ ys_logp.eval({xs_vv: xs_test, ys_vv: ys_test}),
+ stats.norm.logpdf(ys_test, xs_test),
+ )
+
+
+def test_scan_carried_deterministic_state():
+ """Test logp of scans with carried states downstream of measured variables.
+
+ A moving average model with 2 lags is used for testing.
+ """
+ rng = np.random.default_rng(490)
+ steps = 99
+
+ rho = pt.vector("rho", shape=(2,))
+ sigma = pt.scalar("sigma")
+
+ def ma2_step(eps_tm2, eps_tm1, rho, sigma):
+ mu = eps_tm1 * rho[0] + eps_tm2 * rho[1]
+ y = pt.random.normal(mu, sigma)
+ eps = y - mu
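+        # Propagate the RNG state so that each scan step draws a fresh innovation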
+ update = {y.owner.inputs[0]: y.owner.outputs[0]}
+ return (eps, y), update
+
+ [_, ma2], ma2_updates = pytensor.scan(
+ fn=ma2_step,
+ outputs_info=[{"initial": pt.arange(2, dtype="float64"), "taps": range(-2, 0)}, None],
+ non_sequences=[rho, sigma],
+ n_steps=steps,
+ strict=True,
+ name="ma2",
+ )
+
+ def ref_logp(values, rho, sigma):
+ epsilon_tm2 = 0
+ epsilon_tm1 = 1
+ step_logps = np.zeros_like(values)
+ for t, value in enumerate(values):
+ mu = epsilon_tm1 * rho[0] + epsilon_tm2 * rho[1]
+ step_logps[t] = stats.norm.logpdf(value, mu, sigma)
+ epsilon_tm2 = epsilon_tm1
+ epsilon_tm1 = value - mu
+ return step_logps
+
+ ma2_vv = ma2.clone()
+ logp_expr = logp(ma2, ma2_vv)
+ assert_no_rvs(logp_expr)
+
+ ma2_test = rng.normal(size=(steps,))
+ rho_test = np.array([0.3, 0.7])
+ sigma_test = 0.9
+
+ np.testing.assert_array_almost_equal(
+ logp_expr.eval({ma2_vv: ma2_test, rho: rho_test, sigma: sigma_test}),
+ ref_logp(ma2_test, rho_test, sigma_test),
+ )
diff --git a/tests/logprob/test_tensor.py b/tests/logprob/test_tensor.py
index a0b8810768..64b4cdf6e1 100644
--- a/tests/logprob/test_tensor.py
+++ b/tests/logprob/test_tensor.py
@@ -38,23 +38,22 @@
import pytensor
import pytest
-from pytensor import tensor as at
+from pytensor import tensor as pt
from pytensor.graph import RewriteDatabaseQuery
from pytensor.graph.rewriting.basic import in2out
from pytensor.graph.rewriting.utils import rewrite_graph
from pytensor.tensor.extra_ops import BroadcastTo
from scipy import stats as st
-from pymc.logprob import factorized_joint_logprob
+from pymc.logprob.basic import factorized_joint_logprob, logp
from pymc.logprob.rewriting import logprob_rewrites_db
from pymc.logprob.tensor import naive_bcast_rv_lift
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc.testing import assert_no_rvs
def test_naive_bcast_rv_lift():
r"""Make sure `naive_bcast_rv_lift` can handle useless scalar `BroadcastTo`\s."""
- X_rv = at.random.normal()
+ X_rv = pt.random.normal()
Z_at = BroadcastTo()(X_rv, ())
# Make sure we're testing what we intend to test
@@ -67,10 +66,10 @@ def test_naive_bcast_rv_lift():
def test_naive_bcast_rv_lift_valued_var():
r"""Check that `naive_bcast_rv_lift` won't touch valued variables"""
- x_rv = at.random.normal(name="x")
- broadcasted_x_rv = at.broadcast_to(x_rv, (2,))
+ x_rv = pt.random.normal(name="x")
+ broadcasted_x_rv = pt.broadcast_to(x_rv, (2,))
- y_rv = at.random.normal(broadcasted_x_rv, name="y")
+ y_rv = pt.random.normal(broadcasted_x_rv, name="y")
x_vv = x_rv.clone()
y_vv = y_rv.clone()
@@ -86,27 +85,29 @@ def test_naive_bcast_rv_lift_valued_var():
def test_bcast_rv_logp():
"""Test that derived logp for broadcasted RV is correct"""
- x_rv = at.random.normal(name="x")
- broadcasted_x_rv = at.broadcast_to(x_rv, (2,))
+ x_rv = pt.random.normal(name="x")
+ broadcasted_x_rv = pt.broadcast_to(x_rv, (2,))
broadcasted_x_rv.name = "broadcasted_x"
broadcasted_x_vv = broadcasted_x_rv.clone()
- logp = joint_logprob({broadcasted_x_rv: broadcasted_x_vv}, sum=False)
- valid_logp = logp.eval({broadcasted_x_vv: [0, 0]})
+ logp = factorized_joint_logprob({broadcasted_x_rv: broadcasted_x_vv})
+ logp_combined = pt.add(*logp.values())
+ valid_logp = logp_combined.eval({broadcasted_x_vv: [0, 0]})
+
assert valid_logp.shape == ()
assert np.isclose(valid_logp, st.norm.logpdf(0))
# It's not possible for broadcasted dimensions to have different values
# This should either raise or return -inf
- invalid_logp = logp.eval({broadcasted_x_vv: [0, 1]})
+ invalid_logp = logp_combined.eval({broadcasted_x_vv: [0, 1]})
assert invalid_logp == -np.inf
def test_measurable_make_vector():
- base1_rv = at.random.normal(name="base1")
- base2_rv = at.random.halfnormal(name="base2")
- base3_rv = at.random.exponential(name="base3")
- y_rv = at.stack((base1_rv, base2_rv, base3_rv))
+ base1_rv = pt.random.normal(name="base1")
+ base2_rv = pt.random.halfnormal(name="base2")
+ base3_rv = pt.random.exponential(name="base3")
+ y_rv = pt.stack((base1_rv, base2_rv, base3_rv))
y_rv.name = "y"
base1_vv = base1_rv.clone()
@@ -114,15 +115,19 @@ def test_measurable_make_vector():
base3_vv = base3_rv.clone()
y_vv = y_rv.clone()
- ref_logp = joint_logprob({base1_rv: base1_vv, base2_rv: base2_vv, base3_rv: base3_vv})
- make_vector_logp = joint_logprob({y_rv: y_vv}, sum=False)
+ ref_logp = factorized_joint_logprob(
+ {base1_rv: base1_vv, base2_rv: base2_vv, base3_rv: base3_vv}
+ )
+ ref_logp_combined = pt.sum([pt.sum(factor) for factor in ref_logp.values()])
+
+ make_vector_logp = logp(y_rv, y_vv)
base1_testval = base1_rv.eval()
base2_testval = base2_rv.eval()
base3_testval = base3_rv.eval()
y_testval = np.stack((base1_testval, base2_testval, base3_testval))
- ref_logp_eval_eval = ref_logp.eval(
+ ref_logp_eval_eval = ref_logp_combined.eval(
{base1_vv: base1_testval, base2_vv: base2_testval, base3_vv: base3_testval}
)
make_vector_logp_eval = make_vector_logp.eval({y_vv: y_testval})
@@ -134,38 +139,42 @@ def test_measurable_make_vector():
@pytest.mark.parametrize("reverse", (False, True))
def test_measurable_make_vector_interdependent(reverse):
"""Test that we can obtain a proper graph when stacked RVs depend on each other"""
- x = at.random.normal(name="x")
+ x = pt.random.normal(name="x")
y_rvs = []
prev_rv = x
for i in range(3):
- next_rv = at.random.normal(prev_rv + 1, name=f"y{i}")
+ next_rv = pt.random.normal(prev_rv + 1, name=f"y{i}")
y_rvs.append(next_rv)
prev_rv = next_rv
if reverse:
y_rvs = y_rvs[::-1]
- ys = at.stack(y_rvs)
+ ys = pt.stack(y_rvs)
ys.name = "ys"
x_vv = x.clone()
ys_vv = ys.clone()
- logp = joint_logprob({x: x_vv, ys: ys_vv})
- assert_no_rvs(logp)
+ logp = factorized_joint_logprob({x: x_vv, ys: ys_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+ assert_no_rvs(logp_combined)
y0_vv = y_rvs[0].clone()
y1_vv = y_rvs[1].clone()
y2_vv = y_rvs[2].clone()
- ref_logp = joint_logprob({x: x_vv, y_rvs[0]: y0_vv, y_rvs[1]: y1_vv, y_rvs[2]: y2_vv})
+ ref_logp = factorized_joint_logprob(
+ {x: x_vv, y_rvs[0]: y0_vv, y_rvs[1]: y1_vv, y_rvs[2]: y2_vv}
+ )
+ ref_logp_combined = pt.sum([pt.sum(factor) for factor in ref_logp.values()])
rng = np.random.default_rng()
x_vv_test = rng.normal()
ys_vv_test = rng.normal(size=3)
np.testing.assert_allclose(
- logp.eval({x_vv: x_vv_test, ys_vv: ys_vv_test}),
- ref_logp.eval(
+ logp_combined.eval({x_vv: x_vv_test, ys_vv: ys_vv_test}).sum(),
+ ref_logp_combined.eval(
{x_vv: x_vv_test, y0_vv: ys_vv_test[0], y1_vv: ys_vv_test[1], y2_vv: ys_vv_test[2]}
),
)
@@ -174,38 +183,42 @@ def test_measurable_make_vector_interdependent(reverse):
@pytest.mark.parametrize("reverse", (False, True))
def test_measurable_join_interdependent(reverse):
"""Test that we can obtain a proper graph when stacked RVs depend on each other"""
- x = at.random.normal(name="x")
+ x = pt.random.normal(name="x")
y_rvs = []
prev_rv = x
for i in range(3):
- next_rv = at.random.normal(prev_rv + 1, name=f"y{i}", size=(1, 2))
+ next_rv = pt.random.normal(prev_rv + 1, name=f"y{i}", size=(1, 2))
y_rvs.append(next_rv)
prev_rv = next_rv
if reverse:
y_rvs = y_rvs[::-1]
- ys = at.concatenate(y_rvs, axis=0)
+ ys = pt.concatenate(y_rvs, axis=0)
ys.name = "ys"
x_vv = x.clone()
ys_vv = ys.clone()
- logp = joint_logprob({x: x_vv, ys: ys_vv})
- assert_no_rvs(logp)
+ logp = factorized_joint_logprob({x: x_vv, ys: ys_vv})
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+ assert_no_rvs(logp_combined)
y0_vv = y_rvs[0].clone()
y1_vv = y_rvs[1].clone()
y2_vv = y_rvs[2].clone()
- ref_logp = joint_logprob({x: x_vv, y_rvs[0]: y0_vv, y_rvs[1]: y1_vv, y_rvs[2]: y2_vv})
+ ref_logp = factorized_joint_logprob(
+ {x: x_vv, y_rvs[0]: y0_vv, y_rvs[1]: y1_vv, y_rvs[2]: y2_vv}
+ )
+ ref_logp_combined = pt.sum([pt.sum(factor) for factor in ref_logp.values()])
rng = np.random.default_rng()
x_vv_test = rng.normal()
ys_vv_test = rng.normal(size=(3, 2))
np.testing.assert_allclose(
- logp.eval({x_vv: x_vv_test, ys_vv: ys_vv_test}),
- ref_logp.eval(
+ logp_combined.eval({x_vv: x_vv_test, ys_vv: ys_vv_test}),
+ ref_logp_combined.eval(
{
x_vv: x_vv_test,
y0_vv: ys_vv_test[0:1],
@@ -229,12 +242,12 @@ def test_measurable_join_interdependent(reverse):
],
)
def test_measurable_join_univariate(size1, size2, axis, concatenate):
- base1_rv = at.random.normal(size=size1, name="base1")
- base2_rv = at.random.exponential(size=size2, name="base2")
+ base1_rv = pt.random.normal(size=size1, name="base1")
+ base2_rv = pt.random.exponential(size=size2, name="base2")
if concatenate:
- y_rv = at.concatenate((base1_rv, base2_rv), axis=axis)
+ y_rv = pt.concatenate((base1_rv, base2_rv), axis=axis)
else:
- y_rv = at.stack((base1_rv, base2_rv), axis=axis)
+ y_rv = pt.stack((base1_rv, base2_rv), axis=axis)
y_rv.name = "y"
base1_vv = base1_rv.clone()
@@ -243,10 +256,10 @@ def test_measurable_join_univariate(size1, size2, axis, concatenate):
base_logps = list(factorized_joint_logprob({base1_rv: base1_vv, base2_rv: base2_vv}).values())
if concatenate:
- base_logps = at.concatenate(base_logps, axis=axis)
+ base_logps = pt.concatenate(base_logps, axis=axis)
else:
- base_logps = at.stack(base_logps, axis=axis)
- y_logp = joint_logprob({y_rv: y_vv}, sum=False)
+ base_logps = pt.stack(base_logps, axis=axis)
+ y_logp = logp(y_rv, y_vv)
assert_no_rvs(y_logp)
base1_testval = base1_rv.eval()
@@ -290,31 +303,31 @@ def test_measurable_join_univariate(size1, size2, axis, concatenate):
],
)
def test_measurable_join_multivariate(size1, supp_size1, size2, supp_size2, axis, concatenate):
- base1_rv = at.random.multivariate_normal(
+ base1_rv = pt.random.multivariate_normal(
np.zeros(supp_size1), np.eye(supp_size1), size=size1, name="base1"
)
- base2_rv = at.random.dirichlet(np.ones(supp_size2), size=size2, name="base2")
+ base2_rv = pt.random.dirichlet(np.ones(supp_size2), size=size2, name="base2")
if concatenate:
- y_rv = at.concatenate((base1_rv, base2_rv), axis=axis)
+ y_rv = pt.concatenate((base1_rv, base2_rv), axis=axis)
else:
- y_rv = at.stack((base1_rv, base2_rv), axis=axis)
+ y_rv = pt.stack((base1_rv, base2_rv), axis=axis)
y_rv.name = "y"
base1_vv = base1_rv.clone()
base2_vv = base2_rv.clone()
y_vv = y_rv.clone()
base_logps = [
- at.atleast_1d(logp)
+ pt.atleast_1d(logp)
for logp in factorized_joint_logprob({base1_rv: base1_vv, base2_rv: base2_vv}).values()
]
if concatenate:
axis_norm = np.core.numeric.normalize_axis_index(axis, base1_rv.ndim)
- base_logps = at.concatenate(base_logps, axis=axis_norm - 1)
+ base_logps = pt.concatenate(base_logps, axis=axis_norm - 1)
else:
axis_norm = np.core.numeric.normalize_axis_index(axis, base1_rv.ndim + 1)
- base_logps = at.stack(base_logps, axis=axis_norm - 1)
- y_logp = joint_logprob({y_rv: y_vv}, sum=False)
+ base_logps = pt.stack(base_logps, axis=axis_norm - 1)
+ y_logp = logp(y_rv, y_vv)
assert_no_rvs(y_logp)
base1_testval = base1_rv.eval()
@@ -330,13 +343,13 @@ def test_measurable_join_multivariate(size1, supp_size1, size2, supp_size2, axis
def test_join_mixed_ndim_supp():
- base1_rv = at.random.normal(size=3, name="base1")
- base2_rv = at.random.dirichlet(np.ones(3), name="base2")
- y_rv = at.concatenate((base1_rv, base2_rv), axis=0)
+ base1_rv = pt.random.normal(size=3, name="base1")
+ base2_rv = pt.random.dirichlet(np.ones(3), name="base2")
+ y_rv = pt.concatenate((base1_rv, base2_rv), axis=0)
y_vv = y_rv.clone()
with pytest.raises(ValueError, match="Joined logps have different number of dimensions"):
- joint_logprob({y_rv: y_vv})
+ logp(y_rv, y_vv)
@pytensor.config.change_flags(cxx="")
@@ -361,9 +374,9 @@ def test_join_mixed_ndim_supp():
@pytest.mark.parametrize("multivariate", (False, True))
def test_measurable_dimshuffle(ds_order, multivariate):
if multivariate:
- base_rv = at.random.dirichlet([1, 2, 3], size=(2, 1))
+ base_rv = pt.random.dirichlet([1, 2, 3], size=(2, 1))
else:
- base_rv = at.exp(at.random.beta(1, 2, size=(2, 1, 3)))
+ base_rv = pt.exp(pt.random.beta(1, 2, size=(2, 1, 3)))
ds_rv = base_rv.dimshuffle(ds_order)
base_vv = base_rv.clone()
@@ -375,20 +388,21 @@ def test_measurable_dimshuffle(ds_order, multivariate):
else:
logp_ds_order = ds_order
- ref_logp = joint_logprob({base_rv: base_vv}, sum=False).dimshuffle(logp_ds_order)
+ ref_logp = logp(base_rv, base_vv).dimshuffle(logp_ds_order)
# Disable local_dimshuffle_rv_lift to test fallback Aeppl rewrite
ir_rewriter = logprob_rewrites_db.query(
RewriteDatabaseQuery(include=["basic"]).excluding("dimshuffle_lift")
)
- ds_logp = joint_logprob({ds_rv: ds_vv}, sum=False, ir_rewriter=ir_rewriter)
- assert ds_logp is not None
+ ds_logp = factorized_joint_logprob({ds_rv: ds_vv}, ir_rewriter=ir_rewriter)
+ ds_logp_combined = pt.add(*ds_logp.values())
+ assert ds_logp_combined is not None
ref_logp_fn = pytensor.function([base_vv], ref_logp)
- ds_logp_fn = pytensor.function([ds_vv], ds_logp)
+ ds_logp_fn = pytensor.function([ds_vv], ds_logp_combined)
base_test_value = base_rv.eval()
- ds_test_value = at.constant(base_test_value).dimshuffle(ds_order).eval()
+ ds_test_value = pt.constant(base_test_value).dimshuffle(ds_order).eval()
np.testing.assert_array_equal(ref_logp_fn(base_test_value), ds_logp_fn(ds_test_value))
@@ -397,7 +411,7 @@ def test_unmeargeable_dimshuffles():
# Test that graphs with DimShuffles that cannot be lifted/merged fail
# Initial support axis is at axis=-1
- x = at.random.dirichlet(
+ x = pt.random.dirichlet(
np.ones((3,)),
size=(4, 2),
)
@@ -405,11 +419,11 @@ def test_unmeargeable_dimshuffles():
y = x.dimshuffle((0, 2, 1))
# Downstream dimshuffle will not be lifted through cumsum. If it ever is,
# we will need a different measurable Op example
- z = at.cumsum(y, axis=-2)
+ z = pt.cumsum(y, axis=-2)
# Support axis is now at axis=-3
w = z.dimshuffle((1, 0, 2))
w_vv = w.clone()
# TODO: Check that logp is correct if this type of graphs is ever supported
with pytest.raises(RuntimeError, match="could not be derived"):
- joint_logprob({w: w_vv})
+ factorized_joint_logprob({w: w_vv})
diff --git a/tests/logprob/test_transforms.py b/tests/logprob/test_transforms.py
index 7c31a5ee3c..22912d0928 100644
--- a/tests/logprob/test_transforms.py
+++ b/tests/logprob/test_transforms.py
@@ -36,7 +36,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy as sp
import scipy.special
@@ -49,7 +49,7 @@
from pymc.distributions.transforms import _default_transform, log, logodds
from pymc.logprob.abstract import MeasurableVariable, _get_measurable_outputs, _logprob
-from pymc.logprob.joint_logprob import factorized_joint_logprob
+from pymc.logprob.basic import factorized_joint_logprob, logp
from pymc.logprob.transforms import (
ChainedTransform,
ExpTransform,
@@ -63,8 +63,7 @@
TransformValuesRewrite,
transformed_variable,
)
-from tests.helpers import assert_no_rvs
-from tests.logprob.utils import joint_logprob
+from pymc.testing import assert_no_rvs
class DirichletScipyDist:
@@ -94,15 +93,15 @@ def logpdf(self, value):
@pytest.mark.parametrize(
"at_dist, dist_params, sp_dist, size",
[
- (at.random.uniform, (0, 1), sp.stats.uniform, ()),
+ (pt.random.uniform, (0, 1), sp.stats.uniform, ()),
(
- at.random.pareto,
+ pt.random.pareto,
(1.5, 10.5),
lambda b, scale: sp.stats.pareto(b, scale=scale),
(),
),
(
- at.random.triangular,
+ pt.random.triangular,
(1.5, 3.0, 10.5),
lambda lower, mode, upper: sp.stats.triang(
(mode - lower) / (upper - lower), loc=lower, scale=upper - lower
@@ -110,13 +109,13 @@ def logpdf(self, value):
(),
),
(
- at.random.halfnormal,
+ pt.random.halfnormal,
(0, 1),
sp.stats.halfnorm,
(),
),
pytest.param(
- at.random.wald,
+ pt.random.wald,
(1.5, 10.5),
lambda mean, scale: sp.stats.invgauss(mean / scale, scale=scale),
(),
@@ -126,49 +125,49 @@ def logpdf(self, value):
),
),
(
- at.random.exponential,
+ pt.random.exponential,
(1.5,),
lambda mu: sp.stats.expon(scale=mu),
(),
),
pytest.param(
- at.random.lognormal,
+ pt.random.lognormal,
(-1.5, 10.5),
lambda mu, sigma: sp.stats.lognorm(s=sigma, loc=0, scale=np.exp(mu)),
(),
),
(
- at.random.lognormal,
+ pt.random.lognormal,
(-1.5, 1.5),
lambda mu, sigma: sp.stats.lognorm(s=sigma, scale=np.exp(mu)),
(),
),
(
- at.random.halfcauchy,
+ pt.random.halfcauchy,
(1.5, 10.5),
lambda alpha, beta: sp.stats.halfcauchy(loc=alpha, scale=beta),
(),
),
(
- at.random.gamma,
+ pt.random.gamma,
(1.5, 10.5),
lambda alpha, inv_beta: sp.stats.gamma(alpha, scale=1.0 / inv_beta),
(),
),
(
- at.random.invgamma,
+ pt.random.invgamma,
(1.5, 10.5),
lambda alpha, beta: sp.stats.invgamma(alpha, scale=beta),
(),
),
(
- at.random.chisquare,
+ pt.random.chisquare,
(1.5,),
lambda df: sp.stats.chi2(df),
(),
),
pytest.param(
- at.random.weibull,
+ pt.random.weibull,
(1.5,),
lambda c: sp.stats.weibull_min(c),
(),
@@ -178,31 +177,31 @@ def logpdf(self, value):
),
),
(
- at.random.beta,
+ pt.random.beta,
(1.5, 1.5),
lambda alpha, beta: sp.stats.beta(alpha, beta),
(),
),
(
- at.random.vonmises,
+ pt.random.vonmises,
(1.5, 10.5),
lambda mu, kappa: sp.stats.vonmises(kappa, loc=mu),
(),
),
(
- at.random.dirichlet,
+ pt.random.dirichlet,
(np.array([0.7, 0.3]),),
lambda alpha: sp.stats.dirichlet(alpha),
(),
),
(
- at.random.dirichlet,
+ pt.random.dirichlet,
(np.array([[0.7, 0.3], [0.9, 0.1]]),),
lambda alpha: DirichletScipyDist(alpha),
(),
),
pytest.param(
- at.random.dirichlet,
+ pt.random.dirichlet,
(np.array([0.3, 0.7]),),
lambda alpha: DirichletScipyDist(alpha),
(3, 2),
@@ -213,7 +212,7 @@ def test_transformed_logprob(at_dist, dist_params, sp_dist, size):
"""
This test takes a `RandomVariable` type, plus parameters, and uses it to
construct a variable ``a`` that's used in the graph ``b =
- at.random.normal(a, 1.0)``. The transformed log-probability is then
+ pt.random.normal(a, 1.0)``. The transformed log-probability is then
computed for ``b``. We then test that the log-probability of ``a`` is
properly transformed, as well as any instances of ``a`` that are used
elsewhere in the graph (i.e. in ``b``), by comparing the graph for the
@@ -225,21 +224,24 @@ def test_transformed_logprob(at_dist, dist_params, sp_dist, size):
a = at_dist(*dist_params, size=size)
a.name = "a"
- a_value_var = at.tensor(dtype=a.dtype, shape=(None,) * a.ndim)
+ a_value_var = pt.tensor(dtype=a.dtype, shape=(None,) * a.ndim)
a_value_var.name = "a_value"
- b = at.random.normal(a, 1.0)
+ b = pt.random.normal(a, 1.0)
b.name = "b"
b_value_var = b.clone()
b_value_var.name = "b_value"
transform = _default_transform(a.owner.op, a)
transform_rewrite = TransformValuesRewrite({a_value_var: transform})
- res = joint_logprob({a: a_value_var, b: b_value_var}, extra_rewrites=transform_rewrite)
+ res = factorized_joint_logprob(
+ {a: a_value_var, b: b_value_var}, extra_rewrites=transform_rewrite
+ )
+ res_combined = pt.sum([pt.sum(factor) for factor in res.values()])
test_val_rng = np.random.RandomState(3238)
- logp_vals_fn = pytensor.function([a_value_var, b_value_var], res)
+ logp_vals_fn = pytensor.function([a_value_var, b_value_var], res_combined)
a_forward_fn = pytensor.function([a_value_var], transform.forward(a_value_var, *a.owner.inputs))
a_backward_fn = pytensor.function(
@@ -303,17 +305,18 @@ def a_backward_fn_(x):
@pytest.mark.parametrize("use_jacobian", [True, False])
def test_simple_transformed_logprob_nojac(use_jacobian):
- X_rv = at.random.halfnormal(0, 3, name="X")
+ X_rv = pt.random.halfnormal(0, 3, name="X")
x_vv = X_rv.clone()
x_vv.name = "x"
transform_rewrite = TransformValuesRewrite({x_vv: log})
- tr_logp = joint_logprob(
+ tr_logp = factorized_joint_logprob(
{X_rv: x_vv}, extra_rewrites=transform_rewrite, use_jacobian=use_jacobian
)
+ tr_logp_combined = pt.sum([pt.sum(factor) for factor in tr_logp.values()])
assert np.isclose(
- tr_logp.eval({x_vv: np.log(2.5)}),
+ tr_logp_combined.eval({x_vv: np.log(2.5)}),
sp.stats.halfnorm(0, 3).logpdf(2.5) + (np.log(2.5) if use_jacobian else 0.0),
)
@@ -329,14 +332,14 @@ class SquareTransform(RVTransform):
name = "square"
def forward(self, value, *inputs):
- return at.power(value, 2)
+ return pt.power(value, 2)
def backward(self, value, *inputs):
- return at.sqrt(value)
+ return pt.sqrt(value)
square_tr = SquareTransform()
- value = at.TensorType("float64", (None,) * ndim)("value")
+ value = pt.TensorType("float64", (None,) * ndim)("value")
value_tr = square_tr.forward(value)
log_jac_det = square_tr.log_jac_det(value_tr)
@@ -351,9 +354,9 @@ def test_hierarchical_uniform_transform():
the value var `x`
"""
- lower_rv = at.random.uniform(0, 1, name="lower")
- upper_rv = at.random.uniform(9, 10, name="upper")
- x_rv = at.random.uniform(lower_rv, upper_rv, name="x")
+ lower_rv = pt.random.uniform(0, 1, name="lower")
+ upper_rv = pt.random.uniform(9, 10, name="upper")
+ x_rv = pt.random.uniform(lower_rv, upper_rv, name="x")
lower = lower_rv.clone()
upper = upper_rv.clone()
@@ -366,19 +369,20 @@ def test_hierarchical_uniform_transform():
x: _default_transform(x_rv.owner.op, x_rv),
}
)
- logp = joint_logprob(
+ logp = factorized_joint_logprob(
{lower_rv: lower, upper_rv: upper, x_rv: x},
extra_rewrites=transform_rewrite,
)
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
- assert_no_rvs(logp)
- assert not np.isinf(logp.eval({lower: -10, upper: 20, x: -20}))
+ assert_no_rvs(logp_combined)
+ assert not np.isinf(logp_combined.eval({lower: -10, upper: 20, x: -20}))
def test_nondefault_transforms():
- loc_rv = at.random.uniform(-10, 10, name="loc")
- scale_rv = at.random.uniform(-1, 1, name="scale")
- x_rv = at.random.normal(loc_rv, scale_rv, name="x")
+ loc_rv = pt.random.uniform(-10, 10, name="loc")
+ scale_rv = pt.random.uniform(-1, 1, name="scale")
+ x_rv = pt.random.normal(loc_rv, scale_rv, name="x")
loc = loc_rv.clone()
scale = scale_rv.clone()
@@ -392,10 +396,11 @@ def test_nondefault_transforms():
}
)
- logp = joint_logprob(
+ logp = factorized_joint_logprob(
{loc_rv: loc, scale_rv: scale, x_rv: x},
extra_rewrites=transform_rewrite,
)
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
# Check numerical evaluation matches with expected transforms
loc_val = 0
@@ -413,7 +418,7 @@ def test_nondefault_transforms():
exp_logp += x_val_tr # log log_jac_det
assert np.isclose(
- logp.eval({loc: loc_val, scale: scale_val_tr, x: x_val_tr}),
+ logp_combined.eval({loc: loc_val, scale: scale_val_tr, x: x_val_tr}),
exp_logp,
)
@@ -423,19 +428,20 @@ def test_default_transform_multiout():
# This SVD value is necessarily `1`, but it's generated by an `Op` with
# multiple outputs and no default output.
- sd = at.linalg.svd(at.eye(1))[1][0]
- x_rv = at.random.normal(0, sd, name="x")
+ sd = pt.linalg.svd(pt.eye(1))[1][0]
+ x_rv = pt.random.normal(0, sd, name="x")
x = x_rv.clone()
transform_rewrite = TransformValuesRewrite({x: None})
- logp = joint_logprob(
+ logp = factorized_joint_logprob(
{x_rv: x},
extra_rewrites=transform_rewrite,
)
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
assert np.isclose(
- logp.eval({x: 1}),
+ logp_combined.eval({x: 1}),
sp.stats.norm(0, 1).logpdf(1),
)
@@ -443,7 +449,7 @@ def test_default_transform_multiout():
@pytest.fixture(scope="module")
def multiout_measurable_op():
# Create a dummy Op that just returns the two inputs
- mu1, mu2 = at.scalars("mu1", "mu2")
+ mu1, mu2 = pt.scalars("mu1", "mu2")
class TestOpFromGraph(OpFromGraph):
def do_constant_folding(self, fgraph, node):
@@ -481,7 +487,8 @@ def test_nondefault_transform_multiout(transform_x, transform_y, multiout_measur
}
)
- logp = joint_logprob({x: x_vv, y: y_vv}, extra_rewrites=transform_rewrite)
+ logp = factorized_joint_logprob({x: x_vv, y: y_vv}, extra_rewrites=transform_rewrite)
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
x_vv_test = np.random.normal()
y_vv_test = np.abs(np.random.normal())
@@ -497,11 +504,13 @@ def test_nondefault_transform_multiout(transform_x, transform_y, multiout_measur
else:
expected_logp += np.log(y_vv_test) + 2 - np.log(y_vv_test)
- np.testing.assert_almost_equal(logp.eval({x_vv: x_vv_test, y_vv: y_vv_test}), expected_logp)
+ np.testing.assert_almost_equal(
+ logp_combined.eval({x_vv: x_vv_test, y_vv: y_vv_test}), expected_logp
+ )
def test_TransformValuesMapping():
- x = at.vector()
+ x = pt.vector()
fg = FunctionGraph(outputs=[x])
tvm = TransformValuesMapping({})
@@ -518,7 +527,7 @@ def test_original_values_output_dict():
Test that the original unconstrained value variable appears as the key of
the logprob factor
"""
- p_rv = at.random.beta(1, 1, name="p")
+ p_rv = pt.random.beta(1, 1, name="p")
p_vv = p_rv.clone()
tr = TransformValuesRewrite({p_vv: logodds})
@@ -533,13 +542,13 @@ def test_mixture_transform():
This test is specific to `MixtureRV`, which is derived from an `OpFromGraph`.
"""
- I_rv = at.random.bernoulli(0.5, name="I")
- Y_1_rv = at.random.beta(100, 1, name="Y_1")
- Y_2_rv = at.random.beta(1, 100, name="Y_2")
+ I_rv = pt.random.bernoulli(0.5, name="I")
+ Y_1_rv = pt.random.beta(100, 1, name="Y_1")
+ Y_2_rv = pt.random.beta(1, 100, name="Y_2")
# A `MixtureRV`, which is an `OpFromGraph` subclass, will replace this
- # `at.stack` in the graph
- Y_rv = at.stack([Y_1_rv, Y_2_rv])[I_rv]
+ # `pt.stack` in the graph
+ Y_rv = pt.stack([Y_1_rv, Y_2_rv])[I_rv]
Y_rv.name = "Y"
i_vv = I_rv.clone()
@@ -547,35 +556,37 @@ def test_mixture_transform():
y_vv = Y_rv.clone()
y_vv.name = "y"
- logp_no_trans = joint_logprob(
+ logp_no_trans = factorized_joint_logprob(
{Y_rv: y_vv, I_rv: i_vv},
)
+ logp_no_trans_comb = pt.sum([pt.sum(factor) for factor in logp_no_trans.values()])
transform_rewrite = TransformValuesRewrite({y_vv: LogTransform()})
with pytest.warns(None) as record:
# This shouldn't raise any warnings
- logp_trans = joint_logprob(
+ logp_trans = factorized_joint_logprob(
{Y_rv: y_vv, I_rv: i_vv},
extra_rewrites=transform_rewrite,
use_jacobian=False,
)
+ logp_trans_combined = pt.sum([pt.sum(factor) for factor in logp_trans.values()])
assert not record.list
# The untransformed graph should be the same as the transformed graph after
# replacing the `Y_rv` value variable with a transformed version of itself
- logp_nt_fg = FunctionGraph(outputs=[logp_no_trans], clone=False)
- y_trans = transformed_variable(at.exp(y_vv), y_vv)
+ logp_nt_fg = FunctionGraph(outputs=[logp_no_trans_comb], clone=False)
+ y_trans = transformed_variable(pt.exp(y_vv), y_vv)
y_trans.name = "y_log"
logp_nt_fg.replace(y_vv, y_trans)
logp_nt = logp_nt_fg.outputs[0]
- assert equal_computations([logp_nt], [logp_trans])
+ assert equal_computations([logp_nt], [logp_trans_combined])
def test_invalid_interval_transform():
- x_rv = at.random.normal(0, 1)
+ x_rv = pt.random.normal(0, 1)
x_vv = x_rv.clone()
msg = "Both edges of IntervalTransform cannot be None"
@@ -599,17 +610,17 @@ def test_chained_transform():
ch = ChainedTransform(
transform_list=[
ScaleTransform(
- transform_args_fn=lambda *inputs: at.constant(scale),
+ transform_args_fn=lambda *inputs: pt.constant(scale),
),
ExpTransform(),
LocTransform(
- transform_args_fn=lambda *inputs: at.constant(loc),
+ transform_args_fn=lambda *inputs: pt.constant(loc),
),
],
- base_op=at.random.multivariate_normal,
+ base_op=pt.random.multivariate_normal,
)
- x = at.random.multivariate_normal(np.zeros(3), np.eye(3))
+ x = pt.random.multivariate_normal(np.zeros(3), np.eye(3))
x_val = x.eval()
x_val_forward = ch.forward(x_val, *x.owner.inputs).eval()
@@ -626,19 +637,19 @@ def test_chained_transform():
log_jac_det = ch.log_jac_det(x_val_forward, *x.owner.inputs, scale, loc)
assert np.isclose(
- log_jac_det.eval(),
- -np.log(scale) - np.sum(np.log(x_val_forward - loc)),
+ pt.sum(log_jac_det).eval(),
+ np.sum(-np.log(scale) - np.log(x_val_forward - loc)),
)
def test_exp_transform_rv():
- base_rv = at.random.normal(0, 1, size=3, name="base_rv")
- y_rv = at.exp(base_rv)
+ base_rv = pt.random.normal(0, 1, size=3, name="base_rv")
+ y_rv = pt.exp(base_rv)
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- logp_fn = pytensor.function([y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ logp_fn = pytensor.function([y_vv], logprob)
y_val = [-2.0, 0.1, 0.3]
np.testing.assert_allclose(
@@ -648,13 +659,13 @@ def test_exp_transform_rv():
def test_log_transform_rv():
- base_rv = at.random.lognormal(0, 1, size=2, name="base_rv")
- y_rv = at.log(base_rv)
+ base_rv = pt.random.lognormal(0, 1, size=2, name="base_rv")
+ y_rv = pt.log(base_rv)
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- logp_fn = pytensor.function([y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ logp_fn = pytensor.function([y_vv], logprob)
y_val = [0.1, 0.3]
np.testing.assert_allclose(
@@ -666,23 +677,23 @@ def test_log_transform_rv():
@pytest.mark.parametrize(
"rv_size, loc_type, addition",
[
- (None, at.scalar, True),
- (2, at.vector, False),
- ((2, 1), at.col, True),
+ (None, pt.scalar, True),
+ (2, pt.vector, False),
+ ((2, 1), pt.col, True),
],
)
def test_loc_transform_rv(rv_size, loc_type, addition):
loc = loc_type("loc")
if addition:
- y_rv = loc + at.random.normal(0, 1, size=rv_size, name="base_rv")
+ y_rv = loc + pt.random.normal(0, 1, size=rv_size, name="base_rv")
else:
- y_rv = at.random.normal(0, 1, size=rv_size, name="base_rv") - at.neg(loc)
+ y_rv = pt.random.normal(0, 1, size=rv_size, name="base_rv") - pt.neg(loc)
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- assert_no_rvs(logp)
- logp_fn = pytensor.function([loc, y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ assert_no_rvs(logprob)
+ logp_fn = pytensor.function([loc, y_vv], logprob)
loc_test_val = np.full(rv_size, 4.0)
y_test_val = np.full(rv_size, 1.0)
@@ -696,23 +707,23 @@ def test_loc_transform_rv(rv_size, loc_type, addition):
@pytest.mark.parametrize(
"rv_size, scale_type, product",
[
- (None, at.scalar, True),
- (1, at.TensorType("floatX", (True,)), True),
- ((2, 3), at.matrix, False),
+ (None, pt.scalar, True),
+ (1, pt.TensorType("floatX", (True,)), True),
+ ((2, 3), pt.matrix, False),
],
)
def test_scale_transform_rv(rv_size, scale_type, product):
scale = scale_type("scale")
if product:
- y_rv = at.random.normal(0, 1, size=rv_size, name="base_rv") * scale
+ y_rv = pt.random.normal(0, 1, size=rv_size, name="base_rv") * scale
else:
- y_rv = at.random.normal(0, 1, size=rv_size, name="base_rv") / at.reciprocal(scale)
+ y_rv = pt.random.normal(0, 1, size=rv_size, name="base_rv") / pt.reciprocal(scale)
y_rv.name = "y"
y_vv = y_rv.clone()
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- assert_no_rvs(logp)
- logp_fn = pytensor.function([scale, y_vv], logp)
+ logprob = logp(y_rv, y_vv)
+ assert_no_rvs(logprob)
+ logp_fn = pytensor.function([scale, y_vv], logprob)
scale_test_val = np.full(rv_size, 4.0)
y_test_val = np.full(rv_size, 1.0)
@@ -724,15 +735,16 @@ def test_scale_transform_rv(rv_size, scale_type, product):
def test_transformed_rv_and_value():
- y_rv = at.random.halfnormal(-1, 1, name="base_rv") + 1
+ y_rv = pt.random.halfnormal(-1, 1, name="base_rv") + 1
y_rv.name = "y"
y_vv = y_rv.clone()
transform_rewrite = TransformValuesRewrite({y_vv: LogTransform()})
- logp = joint_logprob({y_rv: y_vv}, extra_rewrites=transform_rewrite)
- assert_no_rvs(logp)
- logp_fn = pytensor.function([y_vv], logp)
+ logp = factorized_joint_logprob({y_rv: y_vv}, extra_rewrites=transform_rewrite)
+ logp_combined = pt.sum([pt.sum(factor) for factor in logp.values()])
+ assert_no_rvs(logp_combined)
+ logp_fn = pytensor.function([y_vv], logp_combined)
y_test_val = -5
@@ -743,60 +755,60 @@ def test_transformed_rv_and_value():
def test_loc_transform_multiple_rvs_fails1():
- x_rv1 = at.random.normal(name="x_rv1")
- x_rv2 = at.random.normal(name="x_rv2")
+ x_rv1 = pt.random.normal(name="x_rv1")
+ x_rv2 = pt.random.normal(name="x_rv2")
y_rv = x_rv1 + x_rv2
y = y_rv.clone()
with pytest.raises(RuntimeError, match="could not be derived"):
- joint_logprob({y_rv: y})
+ factorized_joint_logprob({y_rv: y})
def test_nested_loc_transform_multiple_rvs_fails2():
- x_rv1 = at.random.normal(name="x_rv1")
- x_rv2 = at.cos(at.random.normal(name="x_rv2"))
+ x_rv1 = pt.random.normal(name="x_rv1")
+ x_rv2 = pt.cos(pt.random.normal(name="x_rv2"))
y_rv = x_rv1 + x_rv2
y = y_rv.clone()
with pytest.raises(RuntimeError, match="could not be derived"):
- joint_logprob({y_rv: y})
+ factorized_joint_logprob({y_rv: y})
def test_discrete_rv_unary_transform_fails():
- y_rv = at.exp(at.random.poisson(1))
+ y_rv = pt.exp(pt.random.poisson(1))
with pytest.raises(RuntimeError, match="could not be derived"):
- joint_logprob({y_rv: y_rv.clone()})
+ factorized_joint_logprob({y_rv: y_rv.clone()})
def test_discrete_rv_multinary_transform_fails():
- y_rv = 5 + at.random.poisson(1)
+ y_rv = 5 + pt.random.poisson(1)
with pytest.raises(RuntimeError, match="could not be derived"):
- joint_logprob({y_rv: y_rv.clone()})
+ factorized_joint_logprob({y_rv: y_rv.clone()})
@pytest.mark.xfail(reason="Check not implemented yet")
def test_invalid_broadcasted_transform_rv_fails():
- loc = at.vector("loc")
- y_rv = loc + at.random.normal(0, 1, size=1, name="base_rv")
+ loc = pt.vector("loc")
+ y_rv = loc + pt.random.normal(0, 1, size=1, name="base_rv")
y_rv.name = "y"
y_vv = y_rv.clone()
# This logp derivation should fail or count the broadcasted values only once
- logp = joint_logprob({y_rv: y_vv}, sum=False)
- assert logp.eval({y_vv: [0, 0, 0, 0], loc: [0, 0, 0, 0]}).shape == ()
+ logprob = logp(y_rv, y_vv)
+ assert logprob.eval({y_vv: [0, 0, 0, 0], loc: [0, 0, 0, 0]}).shape == ()
@pytest.mark.parametrize("numerator", (1.0, 2.0))
def test_reciprocal_rv_transform(numerator):
shape = 3
scale = 5
- x_rv = numerator / at.random.gamma(shape, scale, size=(2,))
+ x_rv = numerator / pt.random.gamma(shape, scale, size=(2,))
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
x_test_val = np.r_[-0.5, 1.5]
assert np.allclose(
@@ -807,11 +819,11 @@ def test_reciprocal_rv_transform(numerator):
def test_sqr_transform():
# The square of a unit normal is a chi-square with 1 df
- x_rv = at.random.normal(0, 1, size=(4,)) ** 2
+ x_rv = pt.random.normal(0, 1, size=(4,)) ** 2
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
x_test_val = np.r_[-0.5, 0.5, 1, 2.5]
assert np.allclose(
@@ -822,11 +834,11 @@ def test_sqr_transform():
def test_sqrt_transform():
# The sqrt of a chisquare with n df is a chi distribution with n df
- x_rv = at.sqrt(at.random.chisquare(df=3, size=(4,)))
+ x_rv = pt.sqrt(pt.random.chisquare(df=3, size=(4,)))
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
x_test_val = np.r_[-2.5, 0.5, 1, 2.5]
assert np.allclose(
@@ -838,11 +850,11 @@ def test_sqrt_transform():
@pytest.mark.parametrize("power", (-3, -1, 1, 5, 7))
def test_negative_value_odd_power_transform(power):
# check that negative values and odd powers evaluate to a finite logp
- x_rv = at.random.normal() ** power
+ x_rv = pt.random.normal() ** power
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
assert np.isfinite(x_logp_fn(1))
assert np.isfinite(x_logp_fn(-1))
@@ -851,11 +863,11 @@ def test_negative_value_odd_power_transform(power):
@pytest.mark.parametrize("power", (-2, 2, 4, 6, 8))
def test_negative_value_even_power_transform(power):
# check that negative values and even powers evaluate to -inf logp
- x_rv = at.random.normal() ** power
+ x_rv = pt.random.normal() ** power
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
assert np.isfinite(x_logp_fn(1))
assert np.isneginf(x_logp_fn(-1))
@@ -864,11 +876,11 @@ def test_negative_value_even_power_transform(power):
@pytest.mark.parametrize("power", (-1 / 3, -1 / 2, 1 / 2, 1 / 3))
def test_negative_value_frac_power_transform(power):
# check that negative values and fractional powers evaluate to -inf logp
- x_rv = at.random.normal() ** power
+ x_rv = pt.random.normal() ** power
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
assert np.isfinite(x_logp_fn(2.5))
assert np.isneginf(x_logp_fn(-2.5))
@@ -876,34 +888,34 @@ def test_negative_value_frac_power_transform(power):
@pytest.mark.parametrize("test_val", (2.5, -2.5))
def test_absolute_transform(test_val):
- x_rv = at.abs(at.random.normal())
- y_rv = at.random.halfnormal()
+ x_rv = pt.abs(pt.random.normal())
+ y_rv = pt.random.halfnormal()
x_vv = x_rv.clone()
y_vv = y_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}, sum=False))
- y_logp_fn = pytensor.function([y_vv], joint_logprob({y_rv: y_vv}, sum=False))
+ x_logp_fn = pytensor.function([x_vv], logp(x_rv, x_vv))
+ y_logp_fn = pytensor.function([y_vv], logp(y_rv, y_vv))
assert np.allclose(x_logp_fn(test_val), y_logp_fn(test_val))
def test_negated_rv_transform():
- x_rv = -at.random.halfnormal()
+ x_rv = -pt.random.halfnormal()
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}))
+ x_logp_fn = pytensor.function([x_vv], pt.sum(logp(x_rv, x_vv)))
assert np.isclose(x_logp_fn(-1.5), sp.stats.halfnorm.logpdf(1.5))
def test_subtracted_rv_transform():
# Choose base RV that is asymmetric around zero
- x_rv = 5.0 - at.random.normal(1.0)
+ x_rv = 5.0 - pt.random.normal(1.0)
x_rv.name = "x"
x_vv = x_rv.clone()
- x_logp_fn = pytensor.function([x_vv], joint_logprob({x_rv: x_vv}))
+ x_logp_fn = pytensor.function([x_vv], pt.sum(logp(x_rv, x_vv)))
assert np.isclose(x_logp_fn(7.3), sp.stats.norm.logpdf(5.0 - 7.3, 1.0))
@@ -911,11 +923,16 @@ def test_subtracted_rv_transform():
def test_scan_transform():
"""Test that Scan valued variables can be transformed"""
- init = at.random.beta(1, 1, name="init")
+ init = pt.random.beta(1, 1, name="init")
init_vv = init.clone()
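+ # Return the new innovation together with an RNG update so Scan threads the random state across steps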
+ def scan_step(prev_innov):
+ next_innov = pt.random.beta(prev_innov * 10, (1 - prev_innov) * 10)
+ update = {next_innov.owner.inputs[0]: next_innov.owner.outputs[0]}
+ return next_innov, update
+
innov, _ = scan(
- fn=lambda prev_innov: at.random.beta(prev_innov * 10, (1 - prev_innov) * 10),
+ fn=scan_step,
outputs_info=[init],
n_steps=4,
)
@@ -937,10 +954,10 @@ def test_scan_transform():
innov = []
prev_innov = init
for i in range(4):
- next_innov = at.random.beta(prev_innov * 10, (1 - prev_innov) * 10, name=f"innov[i]")
+ next_innov = pt.random.beta(prev_innov * 10, (1 - prev_innov) * 10, name=f"innov[{i}]")
innov.append(next_innov)
prev_innov = next_innov
- innov = at.stack(innov)
+ innov = pt.stack(innov)
innov.name = "innov"
tr = TransformValuesRewrite(
@@ -959,3 +976,28 @@ def test_scan_transform():
"innov": np.full((4,), -0.5),
}
np.testing.assert_allclose(logp_fn(**test_point), ref_logp_fn(**test_point))
+
+
+@pytest.mark.parametrize("shift", [1.5, np.array([-0.5, 1, 0.3])])
+@pytest.mark.parametrize("scale", [2.0, np.array([1.5, 3.3, 1.0])])
+def test_multivariate_transform(shift, scale):
+ mu = np.array([0, 0.9, -2.1])
+ cov = np.array([[1, 0, 0.9], [0, 1, 0], [0.9, 0, 1]])
+ x_rv_raw = pt.random.multivariate_normal(mu, cov=cov)
+ x_rv = shift + x_rv_raw * scale
+ x_rv.name = "x"
+
+ x_vv = x_rv.clone()
+ logp = factorized_joint_logprob({x_rv: x_vv})[x_vv]
+ assert_no_rvs(logp)
+
+ x_vv_test = np.array([5.0, 4.9, -6.3])
+ scale_mat = scale * np.eye(x_vv_test.shape[0])
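+ # The affine transform of a multivariate normal is again multivariate normal,
+ # with mean shift + mu * scale and covariance scale_mat @ cov @ scale_mat.T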
+ np.testing.assert_almost_equal(
+ logp.eval({x_vv: x_vv_test}),
+ sp.stats.multivariate_normal.logpdf(
+ x_vv_test,
+ shift + mu * scale,
+ scale_mat @ cov @ scale_mat.T,
+ ),
+ )
diff --git a/tests/logprob/test_utils.py b/tests/logprob/test_utils.py
index 8fe398195c..363e94c76e 100644
--- a/tests/logprob/test_utils.py
+++ b/tests/logprob/test_utils.py
@@ -38,7 +38,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor import function
@@ -47,8 +47,8 @@
import pymc as pm
-from pymc.logprob.abstract import MeasurableVariable, get_measurable_outputs, logprob
-from pymc.logprob.joint_logprob import joint_logp
+from pymc.logprob.abstract import MeasurableVariable, get_measurable_outputs
+from pymc.logprob.basic import joint_logp, logp
from pymc.logprob.utils import (
ParameterValueError,
dirac_delta,
@@ -57,20 +57,20 @@
rvs_to_value_vars,
walk_model,
)
-from tests.helpers import assert_no_rvs
+from pymc.testing import assert_no_rvs
from tests.logprob.utils import create_pytensor_params, scipy_logprob_tester
def test_walk_model():
- d = at.vector("d")
- b = at.vector("b")
+ d = pt.vector("d")
+ b = pt.vector("b")
c = uniform(0.0, d)
c.name = "c"
- e = at.log(c)
+ e = pt.log(c)
a = normal(e, b)
a.name = "a"
- test_graph = at.exp(a + 1)
+ test_graph = pt.exp(a + 1)
res = list(walk_model((test_graph,)))
assert a in res
assert c not in res
@@ -85,28 +85,28 @@ def test_walk_model():
def test_rvs_to_value_vars():
- a = at.random.uniform(0.0, 1.0)
+ a = pt.random.uniform(0.0, 1.0)
a.name = "a"
a.tag.value_var = a_value_var = a.clone()
- b = at.random.uniform(0, a + 1.0)
+ b = pt.random.uniform(0, a + 1.0)
b.name = "b"
b.tag.value_var = b_value_var = b.clone()
- c = at.random.normal()
+ c = pt.random.normal()
c.name = "c"
c.tag.value_var = c_value_var = c.clone()
- d = at.log(c + b) + 2.0
+ d = pt.log(c + b) + 2.0
initial_replacements = {b: b_value_var, c: c_value_var}
(res,), replaced = rvs_to_value_vars((d,), initial_replacements=initial_replacements)
- assert res.owner.op == at.add
+ assert res.owner.op == pt.add
log_output = res.owner.inputs[0]
- assert log_output.owner.op == at.log
+ assert log_output.owner.op == pt.log
log_add_output = res.owner.inputs[0].owner.inputs[0]
- assert log_add_output.owner.op == at.add
+ assert log_add_output.owner.op == pt.add
c_output = log_add_output.owner.inputs[0]
# We make sure that the random variables were replaced
@@ -127,19 +127,19 @@ def test_rvs_to_value_vars():
def test_rvs_to_value_vars_intermediate_rv():
"""Test that function replaces values above an intermediate RV."""
- a = at.random.uniform(0.0, 1.0)
+ a = pt.random.uniform(0.0, 1.0)
a.name = "a"
a.tag.value_var = a_value_var = a.clone()
- b = at.random.uniform(0, a + 1.0)
+ b = pt.random.uniform(0, a + 1.0)
b.name = "b"
b.tag.value_var = b.clone()
- c = at.random.normal()
+ c = pt.random.normal()
c.name = "c"
c.tag.value_var = c_value_var = c.clone()
- d = at.log(c + b) + 2.0
+ d = pt.log(c + b) + 2.0
initial_replacements = {a: a_value_var, c: c_value_var}
(res,), replaced = rvs_to_value_vars((d,), initial_replacements=initial_replacements)
@@ -159,11 +159,11 @@ def test_rvs_to_value_vars_intermediate_rv():
def test_CheckParameter():
- mu = at.constant(0)
- sigma = at.scalar("sigma")
- x_rv = at.random.normal(mu, sigma, name="x")
- x_vv = at.constant(0)
- x_logp = logprob(x_rv, x_vv)
+ mu = pt.constant(0)
+ sigma = pt.scalar("sigma")
+ x_rv = pt.random.normal(mu, sigma, name="x")
+ x_vv = pt.constant(0)
+ x_logp = logp(x_rv, x_vv)
x_logp_fn = function([sigma], x_logp)
with pytest.raises(ParameterValueError, match="sigma > 0"):
@@ -172,7 +172,7 @@ def test_CheckParameter():
def test_dirac_delta():
fn = pytensor.function(
- [], dirac_delta(at.as_tensor(1)), mode=get_default_mode().excluding("useless")
+ [], dirac_delta(pt.as_tensor(1)), mode=get_default_mode().excluding("useless")
)
with pytest.warns(UserWarning, match=".*DiracDelta.*"):
assert np.array_equal(fn(), 1)
diff --git a/tests/logprob/utils.py b/tests/logprob/utils.py
index 644d2a83db..368b77a5f6 100644
--- a/tests/logprob/utils.py
+++ b/tests/logprob/utils.py
@@ -39,67 +39,14 @@
import numpy as np
from pytensor import tensor as pt
-from pytensor.graph.basic import walk
-from pytensor.graph.op import HasInnerGraph
from pytensor.tensor.var import TensorVariable
from scipy import stats as stats
-from pymc.logprob import factorized_joint_logprob
-from pymc.logprob.abstract import (
- MeasurableVariable,
- get_measurable_outputs,
- icdf,
- logcdf,
- logprob,
-)
+from pymc.logprob import factorized_joint_logprob, icdf, logcdf, logp
+from pymc.logprob.abstract import get_measurable_outputs
from pymc.logprob.utils import ignore_logprob
-def joint_logprob(*args, sum: bool = True, **kwargs) -> Optional[TensorVariable]:
- """Create a graph representing the joint log-probability/measure of a graph.
-
- This function calls `factorized_joint_logprob` and returns the combined
- log-probability factors as a single graph.
-
- Parameters
- ----------
- sum: bool
- If ``True`` each factor is collapsed to a scalar via ``sum`` before
- being joined with the remaining factors. This may be necessary to
- avoid incorrect broadcasting among independent factors.
-
- """
- logprob = factorized_joint_logprob(*args, **kwargs)
- if not logprob:
- return None
- if len(logprob) == 1:
- logprob = tuple(logprob.values())[0]
- if sum:
- return pt.sum(logprob)
- return logprob
- if sum:
- return pt.sum([pt.sum(factor) for factor in logprob.values()])
- return pt.add(*logprob.values())
-
-
-def assert_no_rvs(var):
- """Assert that there are no `MeasurableVariable` nodes in a graph."""
-
- def expand(r):
- owner = r.owner
- if owner:
- inputs = list(reversed(owner.inputs))
-
- if isinstance(owner.op, HasInnerGraph):
- inputs += owner.op.inner_outputs
-
- return inputs
-
- for v in walk([var], expand, False):
- if v.owner and isinstance(v.owner.op, MeasurableVariable):
- raise AssertionError(f"Variable {v} is a MeasurableVariable")
-
-
def simulate_poiszero_hmm(
N, mu=10.0, pi_0_a=np.r_[1, 1], p_0_a=np.r_[5, 1], p_1_a=np.r_[1, 1], seed=None
):
@@ -188,7 +135,7 @@ def scipy_logprob_tester(
test_fn = getattr(stats, name)
if test == "logprob":
- pytensor_res = logprob(rv_var, pt.as_tensor(obs))
+ pytensor_res = logp(rv_var, pt.as_tensor(obs))
elif test == "logcdf":
pytensor_res = logcdf(rv_var, pt.as_tensor(obs))
elif test == "icdf":
diff --git a/tests/models.py b/tests/models.py
index fd18a91102..70ad4dc768 100644
--- a/tests/models.py
+++ b/tests/models.py
@@ -16,7 +16,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
from pytensor.compile.ops import as_op
@@ -38,7 +38,7 @@ def simple_model():
def another_simple_model():
_, _model, _ = simple_model()
with _model:
- pm.Potential("pot", at.ones((10, 10)))
+ pm.Potential("pot", pt.ones((10, 10)))
return _model
@@ -63,7 +63,7 @@ def multidimensional_model():
def simple_arbitrary_det():
- scalar_type = at.dscalar if pytensor.config.floatX == "float64" else at.fscalar
+ scalar_type = pt.dscalar if pytensor.config.floatX == "float64" else pt.fscalar
@as_op(itypes=[scalar_type], otypes=[scalar_type])
def arbitrary_det(value):
@@ -89,7 +89,7 @@ def simple_2model():
p = 0.4
with Model() as model:
x = pm.Normal("x", mu, tau=tau, initval=0.1)
- pm.Deterministic("logx", at.log(x))
+ pm.Deterministic("logx", pt.log(x))
pm.Bernoulli("y", p)
return model.initial_point(), model
@@ -99,7 +99,7 @@ def simple_2model_continuous():
tau = 1.3
with Model() as model:
x = pm.Normal("x", mu, tau=tau, initval=0.1)
- pm.Deterministic("logx", at.log(x))
+ pm.Deterministic("logx", pt.log(x))
pm.Beta("y", alpha=1, beta=1, size=2)
return model.initial_point(), model
@@ -111,8 +111,8 @@ def mv_simple():
with pm.Model() as model:
pm.MvNormal(
"x",
- at.constant(mu),
- tau=at.constant(tau),
+ pt.constant(mu),
+ tau=pt.constant(tau),
initval=floatX_array([0.1, 1.0, 0.8]),
)
H = tau
@@ -127,8 +127,8 @@ def mv_simple_coarse():
with pm.Model() as model:
pm.MvNormal(
"x",
- at.constant(mu),
- tau=at.constant(tau),
+ pt.constant(mu),
+ tau=pt.constant(tau),
initval=floatX_array([0.1, 1.0, 0.8]),
)
H = tau
@@ -143,8 +143,8 @@ def mv_simple_very_coarse():
with pm.Model() as model:
pm.MvNormal(
"x",
- at.constant(mu),
- tau=at.constant(tau),
+ pt.constant(mu),
+ tau=pt.constant(tau),
initval=floatX_array([0.1, 1.0, 0.8]),
)
H = tau
@@ -157,7 +157,7 @@ def mv_simple_discrete():
n = 5
p = floatX_array([0.15, 0.85])
with pm.Model() as model:
- pm.Multinomial("x", n, at.constant(p), initval=np.array([1, 4]))
+ pm.Multinomial("x", n, pt.constant(p), initval=np.array([1, 4]))
mu = n * p
# covariance matrix
C = np.zeros((d, d))
diff --git a/tests/ode/test_ode.py b/tests/ode/test_ode.py
index 599ed256de..613572fd90 100644
--- a/tests/ode/test_ode.py
+++ b/tests/ode/test_ode.py
@@ -15,7 +15,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from scipy.stats import norm
@@ -23,7 +23,7 @@
import pymc as pm
from pymc.ode import DifferentialEquation
-from tests.helpers import fast_unstable_sampling_mode
+from pymc.testing import fast_unstable_sampling_mode
def test_simulate():
@@ -112,13 +112,13 @@ def ode_func_4(y, t, p):
np.testing.assert_array_equal(model4_sens_ic, model4._sens_ic)
def test_sens_ic_vector_2_param_tensor(self):
- # Vector ODE 2 Param with return type at.TensorVariable
+ # Vector ODE 2 Param with return type pt.TensorVariable
def ode_func_4_t(y, t, p):
# Make sure that ds and di are vectors by slicing
ds = -p[0:1] * y[0:1] * y[1:]
di = p[0:1] * y[0:1] * y[1:] - p[1:] * y[1:]
- return at.concatenate([ds, di], axis=0)
+ return pt.concatenate([ds, di], axis=0)
# Instantiate ODE model
model4_t = DifferentialEquation(
@@ -240,7 +240,7 @@ def system_2d_tensor(y, t, p):
s1 = np.exp(-t) - p[0] * y[1]
s2 = np.exp(-t) - p[0] * y[2]
s3 = np.exp(-t) - p[0] * y[3]
- return at.stack((s0, s1, s2, s3)).reshape((2, 2))
+ return pt.stack((s0, s1, s2, s3)).reshape((2, 2))
DifferentialEquation(
func=system_2d_tensor, t0=0, times=self.times, n_states=4, n_theta=1
diff --git a/tests/sampler_fixtures.py b/tests/sampler_fixtures.py
index 9a7f1aead3..549ed606e3 100644
--- a/tests/sampler_fixtures.py
+++ b/tests/sampler_fixtures.py
@@ -14,15 +14,15 @@
import arviz as az
import numpy as np
import numpy.testing as npt
-import pytensor.tensor as at
+import pytensor.tensor as pt
from scipy import stats
import pymc as pm
from pymc.backends.arviz import to_inference_data
+from pymc.testing import SeededTest
from pymc.util import get_var_name
-from tests.helpers import SeededTest
class KnownMean:
@@ -126,9 +126,9 @@ def make_model(cls):
"chol_packed", eta=3, n=5, sd_dist=sd_dist, compute_corr=False
)
chol = pm.expand_packed_triangular(5, chol_packed, lower=True)
- cov = at.dot(chol, chol.T)
- stds = at.sqrt(at.diag(cov))
- pm.Deterministic("log_stds", at.log(stds))
+ cov = pt.dot(chol, chol.T)
+ stds = pt.sqrt(pt.diag(cov))
+ pm.Deterministic("log_stds", pt.log(stds))
corr = cov / stds[None, :] / stds[:, None]
corr_entries_unit = (corr[np.tril_indices(5, -1)] + 1) / 2
pm.Deterministic("corr_entries_unit", corr_entries_unit)
diff --git a/tests/sampling/test_forward.py b/tests/sampling/test_forward.py
index 751cc9a699..e86243eef6 100644
--- a/tests/sampling/test_forward.py
+++ b/tests/sampling/test_forward.py
@@ -20,7 +20,7 @@
import numpy.random as npr
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import xarray as xr
@@ -40,7 +40,7 @@
get_vars_in_point_list,
observed_dependent_deterministics,
)
-from tests.helpers import SeededTest, fast_unstable_sampling_mode
+from pymc.testing import SeededTest, fast_unstable_sampling_mode
class TestDraw(SeededTest):
@@ -333,9 +333,9 @@ def test_non_random_model_variable(self):
# A user may register non-pure RandomVariables that can nevertheless be
# sampled, as long as a custom logprob is dispatched or we can infer
# its logprob (which is the case for `clip`)
- y = at.clip(pm.Normal.dist(), -1, 1)
+ y = pt.clip(pm.Normal.dist(), -1, 1)
y = model.register_rv(y, name="y")
- y_abs = pm.Deterministic("y_abs", at.abs(y))
+ y_abs = pm.Deterministic("y_abs", pt.abs(y))
obs = pm.Normal("obs", y_abs, observed=np.zeros(10))
# y_abs should be resampled even if in the trace, because the source y is missing
@@ -1096,7 +1096,7 @@ def test_transformed(self):
phi = pm.Beta("phi", alpha=1.0, beta=1.0)
kappa_log = pm.Exponential("logkappa", lam=5.0)
- kappa = pm.Deterministic("kappa", at.exp(kappa_log))
+ kappa = pm.Deterministic("kappa", pt.exp(kappa_log))
thetas = pm.Beta("thetas", alpha=phi * kappa, beta=(1.0 - phi) * kappa, size=n)
diff --git a/tests/sampling/test_jax.py b/tests/sampling/test_jax.py
index 485d7bf69c..68755721c0 100644
--- a/tests/sampling/test_jax.py
+++ b/tests/sampling/test_jax.py
@@ -20,7 +20,7 @@
import jax
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from numpyro.infer import MCMC
@@ -135,8 +135,8 @@ def test_deterministic_samples(sampler):
def test_get_jaxified_graph():
# Check that jaxifying a graph does not emit the Supervisor Warning. This test can
# be removed once https://github.com/pytensor-devs/pytensor/issues/637 is sorted.
- x = at.scalar("x")
- y = at.exp(x)
+ x = pt.scalar("x")
+ y = pt.exp(x)
with warnings.catch_warnings():
warnings.simplefilter("error")
fn = get_jaxified_graph(inputs=[x], outputs=[y])
@@ -190,7 +190,7 @@ def test_get_jaxified_logp():
with pm.Model() as m:
x = pm.Flat("x")
y = pm.Flat("y")
- pm.Potential("pot", at.log(at.exp(x) + at.exp(y)))
+ pm.Potential("pot", pt.log(pt.exp(x) + pt.exp(y)))
jax_fn = get_jaxified_logp(m)
# This would underflow if not optimized
diff --git a/tests/sampling/test_mcmc.py b/tests/sampling/test_mcmc.py
index 8f7e06fb5c..6dff413179 100644
--- a/tests/sampling/test_mcmc.py
+++ b/tests/sampling/test_mcmc.py
@@ -21,7 +21,7 @@
import numpy as np
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.special
@@ -45,7 +45,7 @@
Metropolis,
Slice,
)
-from tests.helpers import SeededTest, fast_unstable_sampling_mode
+from pymc.testing import SeededTest, fast_unstable_sampling_mode
from tests.models import simple_init
@@ -534,7 +534,7 @@ def test_shared_named(self):
initval=np.atleast_2d(0),
)
theta = pm.Normal(
- "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1)
+ "theta", mu=pt.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1)
)
res = theta.eval()
assert np.isclose(res, 0.0)
@@ -550,13 +550,13 @@ def test_shared_unnamed(self):
initval=np.atleast_2d(0),
)
theta = pm.Normal(
- "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1)
+ "theta", mu=pt.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1)
)
res = theta.eval()
assert np.isclose(res, 0.0)
def test_constant_named(self):
- G_var = at.constant(np.atleast_2d(1.0), name="G")
+ G_var = pt.constant(np.atleast_2d(1.0), name="G")
with pm.Model():
theta0 = pm.Normal(
"theta0",
@@ -566,7 +566,7 @@ def test_constant_named(self):
initval=np.atleast_2d(0),
)
theta = pm.Normal(
- "theta", mu=at.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1)
+ "theta", mu=pt.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1)
)
res = theta.eval()
@@ -744,8 +744,8 @@ def test_normal_nograd_op(self):
# a custom PyTensor Op that does not have a grad:
is_64 = pytensor.config.floatX == "float64"
- itypes = [at.dscalar] if is_64 else [at.fscalar]
- otypes = [at.dscalar] if is_64 else [at.fscalar]
+ itypes = [pt.dscalar] if is_64 else [pt.fscalar]
+ otypes = [pt.dscalar] if is_64 else [pt.fscalar]
@as_op(itypes, otypes)
def kill_grad(x):
diff --git a/tests/sampling/test_mcmc_external.py b/tests/sampling/test_mcmc_external.py
index 2439738955..3c86154c14 100644
--- a/tests/sampling/test_mcmc_external.py
+++ b/tests/sampling/test_mcmc_external.py
@@ -13,13 +13,11 @@
# limitations under the License.
import numpy as np
+import numpy.testing as npt
import pytest
from pymc import Model, Normal, sample
-# turns all warnings into errors for this module
-pytestmark = pytest.mark.filterwarnings("error")
-
@pytest.mark.parametrize("nuts_sampler", ["pymc", "nutpie", "blackjax", "numpyro"])
def test_external_nuts_sampler(recwarn, nuts_sampler):
@@ -63,3 +61,16 @@ def test_external_nuts_sampler(recwarn, nuts_sampler):
assert idata1.posterior.chain.size == 2
assert idata1.posterior.draw.size == 500
np.testing.assert_array_equal(idata1.posterior.x, idata2.posterior.x)
+
+
+def test_step_args():
+ with Model() as model:
+ a = Normal("a")
+ idata = sample(
+ nuts_sampler="numpyro",
+ target_accept=0.5,
+ nuts={"max_treedepth": 10},
+ random_seed=1410,
+ )
+
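+ # The tuned acceptance rate should end up near the requested target_accept of 0.5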
+ npt.assert_almost_equal(idata.sample_stats.acceptance_rate.mean(), 0.5, decimal=1)
diff --git a/tests/sampling/test_parallel.py b/tests/sampling/test_parallel.py
index c41233aa1e..2b56882f3c 100644
--- a/tests/sampling/test_parallel.py
+++ b/tests/sampling/test_parallel.py
@@ -20,7 +20,7 @@
import cloudpickle
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor.compile.ops import as_op
@@ -72,7 +72,7 @@ def test_bad_unpickle():
at_vector = TensorType(pytensor.config.floatX, [False])
-@as_op([at_vector, at.iscalar], [at_vector])
+@as_op([at_vector, pt.iscalar], [at_vector])
def _crash_remote_process(a, master_pid):
if os.getpid() != master_pid:
sys.exit(0)
@@ -83,7 +83,7 @@ def test_remote_pipe_closed():
master_pid = os.getpid()
with pm.Model():
x = pm.Normal("x", shape=2, mu=0.1)
- at_pid = at.as_tensor_variable(np.array(master_pid, dtype="int32"))
+ at_pid = pt.as_tensor_variable(np.array(master_pid, dtype="int32"))
pm.Normal("y", mu=_crash_remote_process(x, at_pid), shape=2)
step = pm.Metropolis()
diff --git a/tests/smc/test_smc.py b/tests/smc/test_smc.py
index c606031c86..5477498360 100644
--- a/tests/smc/test_smc.py
+++ b/tests/smc/test_smc.py
@@ -15,7 +15,7 @@
import warnings
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.stats as st
@@ -26,7 +26,8 @@
from pymc.backends.base import MultiTrace
from pymc.pytensorf import floatX
from pymc.smc.kernels import IMH, systematic_resampling
-from tests.helpers import SeededTest, assert_random_state_equal
+from pymc.testing import SeededTest
+from tests.helpers import assert_random_state_equal
class TestSMC(SeededTest):
@@ -52,16 +53,16 @@ def two_gaussians(x):
Mixture of gaussians likelihood
"""
log_like1 = (
- -0.5 * n * at.log(2 * np.pi)
- - 0.5 * at.log(dsigma)
+ -0.5 * n * pt.log(2 * np.pi)
+ - 0.5 * pt.log(dsigma)
- 0.5 * (x - mu1).T.dot(isigma).dot(x - mu1)
)
log_like2 = (
- -0.5 * n * at.log(2 * np.pi)
- - 0.5 * at.log(dsigma)
+ -0.5 * n * pt.log(2 * np.pi)
+ - 0.5 * pt.log(dsigma)
- 0.5 * (x - mu2).T.dot(isigma).dot(x - mu2)
)
- return at.log(w1 * at.exp(log_like1) + w2 * at.exp(log_like2))
+ return pt.log(w1 * pt.exp(log_like1) + w2 * pt.exp(log_like2))
with pm.Model() as self.SMC_test:
X = pm.Uniform("X", lower=-2, upper=2.0, shape=n)
diff --git a/tests/stats/test_convergence.py b/tests/stats/test_convergence.py
index 7dba129a37..b5e99a09ff 100644
--- a/tests/stats/test_convergence.py
+++ b/tests/stats/test_convergence.py
@@ -31,6 +31,17 @@ def test_warn_divergences():
assert "2 divergences after tuning" in warns[0].message
+def test_warn_treedepth():
+ idata = arviz.from_dict(
+ sample_stats={
+ "reached_max_treedepth": np.array([[0, 0, 0], [0, 1, 0]]).astype(bool),
+ }
+ )
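+ # Only the second chain (index 1) hits the maximum tree depth, so a single warning is expected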
+ warns = convergence.warn_treedepth(idata)
+ assert len(warns) == 1
+ assert "Chain 1 reached the maximum tree depth" in warns[0].message
+
+
def test_log_warning_stats(caplog):
s1 = dict(warning="Temperature too low!")
s2 = dict(warning="Temperature too high!")
diff --git a/tests/step_methods/hmc/test_nuts.py b/tests/step_methods/hmc/test_nuts.py
index 70f9a589f9..35090f7358 100644
--- a/tests/step_methods/hmc/test_nuts.py
+++ b/tests/step_methods/hmc/test_nuts.py
@@ -17,7 +17,7 @@
import warnings
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import pymc as pm
@@ -125,8 +125,8 @@ def test_bad_init_parallel(self):
def test_emits_energy_warnings(self, caplog):
with pm.Model():
a = pm.Normal("a", size=2, initval=floatX(np.zeros(2)))
- a = at.switch(a > 0, np.inf, a)
- b = at.slinalg.solve(floatX(np.eye(2)), a, check_finite=False)
+ a = pt.switch(a > 0, np.inf, a)
+ b = pt.slinalg.solve(floatX(np.eye(2)), a, check_finite=False)
pm.Normal("c", mu=b, size=2, initval=floatX(np.r_[0.0, 0.0]))
caplog.clear()
# The logger name must be specified for DEBUG level capturing to work
diff --git a/tests/step_methods/test_compound.py b/tests/step_methods/test_compound.py
index ba9d90634d..93cbecbc0d 100644
--- a/tests/step_methods/test_compound.py
+++ b/tests/step_methods/test_compound.py
@@ -31,7 +31,8 @@
get_stats_dtypes_shapes_from_steps,
infer_warn_stats_info,
)
-from tests.helpers import StepMethodTester, fast_unstable_sampling_mode
+from pymc.testing import fast_unstable_sampling_mode
+from tests.helpers import StepMethodTester
from tests.models import simple_2model_continuous
diff --git a/tests/step_methods/test_metropolis.py b/tests/step_methods/test_metropolis.py
index cee1828408..8da8faac48 100644
--- a/tests/step_methods/test_metropolis.py
+++ b/tests/step_methods/test_metropolis.py
@@ -31,12 +31,9 @@
MultivariateNormalProposal,
NormalProposal,
)
+from pymc.testing import fast_unstable_sampling_mode
from tests import sampler_fixtures as sf
-from tests.helpers import (
- RVsAssignmentStepsTester,
- StepMethodTester,
- fast_unstable_sampling_mode,
-)
+from tests.helpers import RVsAssignmentStepsTester, StepMethodTester
from tests.models import mv_simple, mv_simple_discrete, simple_categorical
diff --git a/tests/test_data.py b/tests/test_data.py
index 09d175de4b..6db3b50875 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -18,7 +18,7 @@
import cloudpickle
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor import shared
@@ -28,7 +28,7 @@
from pymc.data import is_minibatch
from pymc.pytensorf import GeneratorOp, floatX
-from tests.helpers import SeededTest
+from pymc.testing import SeededTest
class TestData(SeededTest):
@@ -318,8 +318,8 @@ def test_explicit_coords(self):
# pass coordinates explicitly, use numpy array in Data container
with pm.Model(coords=coords) as pmodel:
# Dims created from coords are constant by default
- assert isinstance(pmodel.dim_lengths["rows"], at.TensorConstant)
- assert isinstance(pmodel.dim_lengths["columns"], at.TensorConstant)
+ assert isinstance(pmodel.dim_lengths["rows"], pt.TensorConstant)
+ assert isinstance(pmodel.dim_lengths["columns"], pt.TensorConstant)
pm.MutableData("observations", data, dims=("rows", "columns"))
# new data with same (!) shape
pm.set_data({"observations": data + 1})
@@ -433,7 +433,7 @@ def test_data_mutable_default_warning(self):
with pm.Model():
with pytest.warns(UserWarning, match="`mutable` kwarg was not specified"):
data = pm.Data("x", [1, 2, 3])
- assert isinstance(data, at.TensorConstant)
+ assert isinstance(data, pt.TensorConstant)
pass
@@ -564,7 +564,7 @@ def test_pickling(self, datagen):
def test_gen_cloning_with_shape_change(self, datagen):
gen = pm.generator(datagen)
- gen_r = at.random.normal(size=gen.shape).T
+ gen_r = pt.random.normal(size=gen.shape).T
X = gen.dot(gen_r)
res, _ = pytensor.scan(lambda x: x.sum(), X, n_steps=X.shape[0])
assert res.eval().shape == (50,)
@@ -597,16 +597,16 @@ def test_1d(self):
assert mb.eval().shape == (20, 10)
def test_allowed(self):
- mb = pm.Minibatch(at.as_tensor(self.data).astype(int), batch_size=20)
+ mb = pm.Minibatch(pt.as_tensor(self.data).astype(int), batch_size=20)
assert is_minibatch(mb)
def test_not_allowed(self):
with pytest.raises(ValueError, match="not valid for Minibatch"):
- mb = pm.Minibatch(at.as_tensor(self.data) * 2, batch_size=20)
+ mb = pm.Minibatch(pt.as_tensor(self.data) * 2, batch_size=20)
def test_not_allowed2(self):
with pytest.raises(ValueError, match="not valid for Minibatch"):
- mb = pm.Minibatch(self.data, at.as_tensor(self.data) * 2, batch_size=20)
+ mb = pm.Minibatch(self.data, pt.as_tensor(self.data) * 2, batch_size=20)
def test_assert(self):
with pytest.raises(
diff --git a/tests/test_initial_point.py b/tests/test_initial_point.py
index 764e38a296..21511b964d 100644
--- a/tests/test_initial_point.py
+++ b/tests/test_initial_point.py
@@ -14,7 +14,7 @@
import cloudpickle
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor.tensor.random.op import RandomVariable
@@ -182,7 +182,7 @@ def test_respects_overrides(self):
jitter_rvs={},
return_transformed=True,
overrides={
- A: at.as_tensor(2, dtype=int),
+ A: pt.as_tensor(2, dtype=int),
B: 3,
C: 5,
},
@@ -238,7 +238,7 @@ def test_numeric_moment_shape(self, rv_cls):
@pytest.mark.parametrize("rv_cls", [pm.Flat, pm.HalfFlat])
def test_symbolic_moment_shape(self, rv_cls):
- s = at.scalar(dtype="int64")
+ s = pt.scalar(dtype="int64")
rv = rv_cls.dist(shape=(s,))
assert not hasattr(rv.tag, "test_value")
assert tuple(moment(rv).shape.eval({s: 4})) == (4,)
diff --git a/tests/test_math.py b/tests/test_math.py
index 3d2120a5f7..c5aed9246c 100644
--- a/tests/test_math.py
+++ b/tests/test_math.py
@@ -17,7 +17,7 @@
import numpy as np
import numpy.testing as npt
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pymc.math import (
@@ -39,7 +39,8 @@
softmax,
)
from pymc.pytensorf import floatX
-from tests.helpers import SeededTest, verify_grad
+from pymc.testing import SeededTest
+from tests.helpers import verify_grad
def test_kronecker():
@@ -48,7 +49,7 @@ def test_kronecker():
[a, b, c] = [np.random.rand(3, 3 + i) for i in range(3)]
custom = kronecker(a, b, c) # Custom version
- nested = at.slinalg.kron(a, at.slinalg.kron(b, c))
+ nested = pt.slinalg.kron(a, pt.slinalg.kron(b, c))
np.testing.assert_array_almost_equal(custom.eval(), nested.eval()) # Standard nested version
@@ -103,7 +104,7 @@ def test_kron_dot():
x = np.random.rand(tot_size).reshape((tot_size, 1))
# Construct entire kronecker product then multiply
big = kronecker(*Ks)
- slow_ans = at.dot(big, x)
+ slow_ans = pt.dot(big, x)
# Use tricks to avoid construction of entire kronecker product
fast_ans = kron_dot(Ks, x)
np.testing.assert_array_almost_equal(slow_ans.eval(), fast_ans.eval())
@@ -118,7 +119,7 @@ def test_kron_solve_lower():
x = np.random.rand(tot_size).reshape((tot_size, 1))
# Construct entire kronecker product then solve
big = kronecker(*Ls)
- slow_ans = at.slinalg.solve_triangular(big, x, lower=True)
+ slow_ans = pt.slinalg.solve_triangular(big, x, lower=True)
# Use tricks to avoid construction of entire kronecker product
fast_ans = kron_solve_lower(Ls, x)
np.testing.assert_array_almost_equal(slow_ans.eval(), fast_ans.eval())
@@ -146,7 +147,7 @@ def test_log1mexp():
0.0,
]
)
- actual = at.log1mexp(-vals).eval()
+ actual = pt.log1mexp(-vals).eval()
npt.assert_allclose(actual, expected)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "divide by zero encountered in log", RuntimeWarning)
@@ -165,7 +166,7 @@ def test_log1mexp_numpy_no_warning():
def test_log1mexp_numpy_integer_input():
- assert np.isclose(log1mexp_numpy(-2, negative_input=True), at.log1mexp(-2).eval())
+ assert np.isclose(log1mexp_numpy(-2, negative_input=True), pt.log1mexp(-2).eval())
def test_log1mexp_deprecation_warnings():
@@ -239,11 +240,11 @@ def test_basic(self):
def test_expand_packed_triangular():
with pytest.raises(ValueError):
- x = at.matrix("x")
+ x = pt.matrix("x")
x.tag.test_value = np.array([[1.0]], dtype=pytensor.config.floatX)
expand_packed_triangular(5, x)
N = 5
- packed = at.vector("packed")
+ packed = pt.vector("packed")
packed.tag.test_value = floatX(np.zeros(N * (N + 1) // 2))
with pytest.raises(TypeError):
expand_packed_triangular(packed.shape[0], packed)
@@ -280,14 +281,14 @@ def test_invlogit_deprecation_warning():
@pytest.mark.parametrize(
"pytensor_function, pymc_wrapper",
[
- (at.special.softmax, softmax),
- (at.special.log_softmax, log_softmax),
+ (pt.special.softmax, softmax),
+ (pt.special.log_softmax, log_softmax),
],
)
def test_softmax_logsoftmax_no_warnings(pytensor_function, pymc_wrapper):
"""Test that wrappers for pytensor functions do not issue Warnings"""
- vector = at.vector("vector")
+ vector = pt.vector("vector")
with pytest.warns(Warning) as record:
pytensor_function(vector)
assert {w.category for w in record.list} == {UserWarning, FutureWarning}
diff --git a/tests/test_model.py b/tests/test_model.py
index 2589e23c4c..c6b176af96 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -24,12 +24,13 @@
import numpy.testing as npt
import pytensor
import pytensor.sparse as sparse
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.sparse as sps
import scipy.stats as st
from pytensor.graph import graph_inputs
+from pytensor.raise_op import Assert, assert_op
from pytensor.tensor import TensorVariable
from pytensor.tensor.random.op import RandomVariable
from pytensor.tensor.sharedvar import ScalarSharedVariable
@@ -42,12 +43,12 @@
from pymc.distributions import Normal, transforms
from pymc.distributions.transforms import log
from pymc.exceptions import ImputationWarning, ShapeError, ShapeWarning
-from pymc.logprob.joint_logprob import joint_logp
+from pymc.logprob.basic import joint_logp
from pymc.logprob.transforms import IntervalTransform
from pymc.model import Point, ValueGradFunction, modelcontext
+from pymc.testing import SeededTest
from pymc.util import _FutureWarningValidatingScratchpad
from pymc.variational.minibatch_rv import MinibatchRandomVariable
-from tests.helpers import SeededTest
from tests.models import simple_model
@@ -60,8 +61,8 @@ def __init__(self, name="", model=None):
self.v2 = pm.Normal("v2", mu=0, sigma=1)
# 2) Potentials and Deterministic variables with method too
# be sure that names will not overlap with other same models
- pm.Deterministic("d", at.constant(1))
- pm.Potential("p", at.constant(1))
+ pm.Deterministic("d", pt.constant(1))
+ pm.Potential("p", pt.constant(1))
class DocstringModel(pm.Model):
@@ -71,7 +72,7 @@ def __init__(self, mean=0, sigma=1, name="", model=None):
Normal("v2", mu=mean, sigma=sigma)
Normal("v3", mu=mean, sigma=Normal("sigma", mu=10, sigma=1, initval=1.0))
Deterministic("v3_sq", self.v3**2)
- Potential("p1", at.constant(1))
+ Potential("p1", pt.constant(1))
class TestBaseModel:
@@ -258,13 +259,13 @@ def test_empty_observed():
class TestValueGradFunction(unittest.TestCase):
def test_no_extra(self):
- a = at.vector("a")
+ a = pt.vector("a")
a.tag.test_value = np.zeros(3, dtype=a.dtype)
f_grad = ValueGradFunction([a.sum()], [a], {}, mode="FAST_COMPILE")
assert f_grad._extra_vars == []
def test_invalid_type(self):
- a = at.ivector("a")
+ a = pt.ivector("a")
a.tag.test_value = np.zeros(3, dtype=a.dtype)
a.dshape = (3,)
a.dsize = 3
@@ -273,17 +274,17 @@ def test_invalid_type(self):
err.match("Invalid dtype")
def setUp(self):
- extra1 = at.iscalar("extra1")
+ extra1 = pt.iscalar("extra1")
extra1_ = np.array(0, dtype=extra1.dtype)
extra1.dshape = tuple()
extra1.dsize = 1
- val1 = at.vector("val1")
+ val1 = pt.vector("val1")
val1_ = np.zeros(3, dtype=val1.dtype)
val1.dshape = (3,)
val1.dsize = 3
- val2 = at.matrix("val2")
+ val2 = pt.matrix("val2")
val2_ = np.zeros((2, 3), dtype=val2.dtype)
val2.dshape = (2, 3)
val2.dsize = 6
@@ -360,10 +361,10 @@ def test_missing_data(self):
# The dtype of the merged observed/missing deterministic should match the RV dtype
assert m.deterministics[0].type.dtype == x2.type.dtype
- pnt = m.initial_point(random_seed=None).copy()
- del pnt["x2_missing"]
+ point = m.initial_point(random_seed=None).copy()
+ del point["x2_missing"]
- res = [gf(DictToArrayBijection.map(Point(pnt, model=m))) for i in range(5)]
+ res = [gf(DictToArrayBijection.map(Point(point, model=m))) for i in range(5)]
# Assert that all the elements of res are equal
assert res[1:] == res[:-1]
@@ -418,7 +419,7 @@ def test_tempered_logp_dlogp():
with pm.Model() as model:
pm.Normal("x")
pm.Normal("y", observed=1)
- pm.Potential("z", at.constant(-1.0, dtype=pytensor.config.floatX))
+ pm.Potential("z", pt.constant(-1.0, dtype=pytensor.config.floatX))
func = model.logp_dlogp_function()
func.set_extra_values({})
@@ -634,7 +635,7 @@ def test_eval_rv_shapes(self):
pm.Normal("untransformed", size=(1, 2))
pm.Uniform("transformed", size=(7,))
obs = pm.Uniform("observed", size=(3,), observed=[0.1, 0.2, 0.3])
- pm.LogNormal("lognorm", mu=at.log(obs))
+ pm.LogNormal("lognorm", mu=pt.log(obs))
pm.Normal("from_dims", dims=("city", "year"))
shapes = pmodel.eval_rv_shapes()
assert shapes["untransformed"] == (1, 2)
@@ -714,7 +715,7 @@ def test_datalogp_multiple_shapes():
with pm.Model() as m:
x = pm.Normal("x", 0, 1)
z1 = pm.Potential("z1", x)
- z2 = pm.Potential("z2", at.full((1, 3), x))
+ z2 = pm.Potential("z2", pt.full((1, 3), x))
y1 = pm.Normal("y1", x, 1, observed=np.array([1]))
y2 = pm.Normal("y2", x, 1, observed=np.array([1, 2]))
y3 = pm.Normal("y3", x, 1, observed=np.array([1, 2, 3]))
@@ -1062,6 +1063,26 @@ def test_deterministic():
assert model["y"] == y
+def test_deterministic_with_dims():
+    """
+    Test to check the passing of dims to a Deterministic
+    """
+ with pm.Model(coords={"observed": range(10)}) as model:
+ x = pm.Normal("x", 0, 1)
+ y = pm.Deterministic("y", x**2, dims=("observed",))
+ assert model.named_vars_to_dims == {"y": ("observed",)}
+
+
+def test_potential_with_dims():
+ """
+ Test to check the passing of dims to the potential
+ """
+ with pm.Model(coords={"observed": range(10)}) as model:
+ x = pm.Normal("x", 0, 1)
+ y = pm.Potential("y", x**2, dims=("observed",))
+ assert model.named_vars_to_dims == {"y": ("observed",)}
+
+
def test_empty_model_representation():
assert pm.Model().str_repr() == ""
@@ -1493,7 +1514,7 @@ def test_tag_future_warning_model():
model = pm.Model()
- x = at.random.normal()
+ x = pt.random.normal()
x.tag.something_else = "5"
x.tag.test_value = 0
assert not isinstance(x.tag, _FutureWarningValidatingScratchpad)
@@ -1533,3 +1554,74 @@ def test_tag_future_warning_model():
assert y_value.eval() == 5
assert isinstance(y_value.tag, _FutureWarningValidatingScratchpad)
+
+
+class TestModelDebug:
+ @pytest.mark.parametrize("fn", ("logp", "dlogp", "random"))
+ def test_no_problems(self, fn, capfd):
+ with pm.Model() as m:
+ x = pm.Normal("x", [1, -1, 1])
+ m.debug(fn=fn)
+
+ out, _ = capfd.readouterr()
+ assert out == "point={'x': array([ 1., -1., 1.])}\n\nNo problems found\n"
+
+ @pytest.mark.parametrize("fn", ("logp", "dlogp", "random"))
+ def test_invalid_parameter(self, fn, capfd):
+ with pm.Model() as m:
+ x = pm.Normal("x", [1, -1, 1])
+ y = pm.HalfNormal("y", tau=x)
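+            # x can take negative values, which is an invalid tau for the HalfNormal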
+ m.debug(fn=fn)
+
+ out, _ = capfd.readouterr()
+ if fn == "dlogp":
+            # each variable's dlogp is 0 or 1 when there is no likelihood, so no problems are detected
+ assert "No problems found" in out
+ else:
+ assert "The parameters evaluate to:\n0: 0.0\n1: [ 1. -1. 1.]" in out
+ if fn == "logp":
+ assert "This does not respect one of the following constraints: sigma > 0" in out
+ else:
+ assert (
+ "The variable y random method raised the following exception: Domain error in arguments."
+ in out
+ )
+
+ @pytest.mark.parametrize("verbose", (True, False))
+ @pytest.mark.parametrize("fn", ("logp", "dlogp", "random"))
+ def test_invalid_parameter_cant_be_evaluated(self, fn, verbose, capfd):
+ with pm.Model() as m:
+ x = pm.Normal("x", [1, 1, 1])
+ sigma = Assert(msg="x > 0")(pm.math.abs(x), (x > 0).all())
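+            # the Assert op raises "x > 0" whenever any element of x is non-positive, so sigma cannot be evaluated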
+ y = pm.HalfNormal("y", sigma=sigma)
+ m.debug(point={"x": [-1, -1, -1], "y_log__": [0, 0, 0]}, fn=fn, verbose=verbose)
+
+ out, _ = capfd.readouterr()
+ assert "{'x': [-1, -1, -1], 'y_log__': [0, 0, 0]}" in out
+ assert "The parameters of the variable y cannot be evaluated: x > 0" in out
+ verbose_str = "Apply node that caused the error:" in out
+ assert verbose_str if verbose else not verbose_str
+
+ def test_invalid_value(self, capfd):
+ with pm.Model() as m:
+ x = pm.Normal("x", [1, -1, 1])
+ y = pm.HalfNormal("y", tau=pm.math.abs(x), initval=[-1, 1, -1], transform=None)
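+            # the negative entries of initval lie outside the HalfNormal support, so their logp is -inf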
+ m.debug()
+
+ out, _ = capfd.readouterr()
+ assert "The parameters of the variable y evaluate to:\n0: array(0., dtype=float32)\n1: array([1., 1., 1.])]"
+ assert "Some of the values of variable y are associated with a non-finite logp" in out
+ assert "value = -1.0 -> logp = -inf" in out
+
+ def test_invalid_observed_value(self, capfd):
+ with pm.Model() as m:
+ theta = pm.Uniform("theta", lower=0, upper=1)
+ y = pm.Uniform("y", lower=0, upper=theta, observed=[0.49, 0.27, 0.53, 0.19])
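+            # observations above theta's initial value (0.5) fall outside the Uniform support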
+ m.debug()
+
+ out, _ = capfd.readouterr()
+ assert "The parameters of the variable y evaluate to:\n0: 0.0\n1: 0.5"
+ assert (
+ "Some of the observed values of variable y are associated with a non-finite logp" in out
+ )
+ assert "value = 0.53 -> logp = -inf" in out
diff --git a/tests/test_model_graph.py b/tests/test_model_graph.py
index 08733958b8..9c2e1caa37 100644
--- a/tests/test_model_graph.py
+++ b/tests/test_model_graph.py
@@ -15,7 +15,7 @@
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
from pytensor.compile.sharedvalue import SharedVariable
@@ -25,7 +25,7 @@
from pymc.exceptions import ImputationWarning
from pymc.model_graph import ModelGraph, model_to_graphviz, model_to_networkx
-from tests.helpers import SeededTest
+from pymc.testing import SeededTest
def school_model():
@@ -343,7 +343,7 @@ class TestModelWithDims(BaseModelGraphTest):
def test_issue_6335_dims_containing_none(self):
with pm.Model(coords=dict(time=np.arange(5))) as pmodel:
- data = at.as_tensor(np.ones((3, 5)))
+ data = pt.as_tensor(np.ones((3, 5)))
pm.Deterministic("n", data, dims=(None, "time"))
mg = ModelGraph(pmodel)
diff --git a/tests/test_printing.py b/tests/test_printing.py
index 62c360ec7f..64fd21eebf 100644
--- a/tests/test_printing.py
+++ b/tests/test_printing.py
@@ -132,63 +132,63 @@ def setup_class(self):
self.formats = [("plain", True), ("plain", False), ("latex", True), ("latex", False)]
self.expected = {
("plain", True): [
- r"alpha ~ N(0, 10)",
- r"sigma ~ N**+(0, 1)",
+ r"alpha ~ Normal(0, 10)",
+ r"sigma ~ HalfNormal(0, 1)",
r"mu ~ Deterministic(f(beta, alpha))",
- r"beta ~ N(0, 10)",
- r"Z ~ N(f(), f())",
- r"nb_with_p_n ~ NB(10, nbp)",
- r"zip ~ MarginalMixture(f(), DiracDelta(0), Pois(5))",
- r"w ~ Dir()",
+ r"beta ~ Normal(0, 10)",
+ r"Z ~ MultivariateNormal(f(), f())",
+ r"nb_with_p_n ~ NegativeBinomial(10, nbp)",
+ r"zip ~ MarginalMixture(f(), DiracDelta(0), Poisson(5))",
+ r"w ~ Dirichlet()",
(
r"nested_mix ~ MarginalMixture(w, "
- r"MarginalMixture(f(), DiracDelta(0), Pois(5)), "
- r"Censored(Bern(0.5), -1, 1))"
+ r"MarginalMixture(f(), DiracDelta(0), Poisson(5)), "
+ r"Censored(Bernoulli(0.5), -1, 1))"
),
- r"Y_obs ~ N(mu, sigma)",
+ r"Y_obs ~ Normal(mu, sigma)",
r"pot ~ Potential(f(beta, alpha))",
],
("plain", False): [
- r"alpha ~ N",
- r"sigma ~ N**+",
+ r"alpha ~ Normal",
+ r"sigma ~ HalfNormal",
r"mu ~ Deterministic",
- r"beta ~ N",
- r"Z ~ N",
- r"nb_with_p_n ~ NB",
+ r"beta ~ Normal",
+ r"Z ~ MultivariateNormal",
+ r"nb_with_p_n ~ NegativeBinomial",
r"zip ~ MarginalMixture",
- r"w ~ Dir",
+ r"w ~ Dirichlet",
r"nested_mix ~ MarginalMixture",
- r"Y_obs ~ N",
+ r"Y_obs ~ Normal",
r"pot ~ Potential",
],
("latex", True): [
- r"$\text{alpha} \sim \operatorname{N}(0,~10)$",
- r"$\text{sigma} \sim \operatorname{N^{+}}(0,~1)$",
+ r"$\text{alpha} \sim \operatorname{Normal}(0,~10)$",
+ r"$\text{sigma} \sim \operatorname{HalfNormal}(0,~1)$",
r"$\text{mu} \sim \operatorname{Deterministic}(f(\text{beta},~\text{alpha}))$",
- r"$\text{beta} \sim \operatorname{N}(0,~10)$",
- r"$\text{Z} \sim \operatorname{N}(f(),~f())$",
- r"$\text{nb_with_p_n} \sim \operatorname{NB}(10,~\text{nbp})$",
- r"$\text{zip} \sim \operatorname{MarginalMixture}(f(),~\operatorname{DiracDelta}(0),~\operatorname{Pois}(5))$",
- r"$\text{w} \sim \operatorname{Dir}(\text{})$",
+ r"$\text{beta} \sim \operatorname{Normal}(0,~10)$",
+ r"$\text{Z} \sim \operatorname{MultivariateNormal}(f(),~f())$",
+ r"$\text{nb_with_p_n} \sim \operatorname{NegativeBinomial}(10,~\text{nbp})$",
+ r"$\text{zip} \sim \operatorname{MarginalMixture}(f(),~\operatorname{DiracDelta}(0),~\operatorname{Poisson}(5))$",
+ r"$\text{w} \sim \operatorname{Dirichlet}(\text{})$",
(
r"$\text{nested_mix} \sim \operatorname{MarginalMixture}(\text{w},"
- r"~\operatorname{MarginalMixture}(f(),~\operatorname{DiracDelta}(0),~\operatorname{Pois}(5)),"
- r"~\operatorname{Censored}(\operatorname{Bern}(0.5),~-1,~1))$"
+ r"~\operatorname{MarginalMixture}(f(),~\operatorname{DiracDelta}(0),~\operatorname{Poisson}(5)),"
+ r"~\operatorname{Censored}(\operatorname{Bernoulli}(0.5),~-1,~1))$"
),
- r"$\text{Y_obs} \sim \operatorname{N}(\text{mu},~\text{sigma})$",
+ r"$\text{Y_obs} \sim \operatorname{Normal}(\text{mu},~\text{sigma})$",
r"$\text{pot} \sim \operatorname{Potential}(f(\text{beta},~\text{alpha}))$",
],
("latex", False): [
- r"$\text{alpha} \sim \operatorname{N}$",
- r"$\text{sigma} \sim \operatorname{N^{+}}$",
+ r"$\text{alpha} \sim \operatorname{Normal}$",
+ r"$\text{sigma} \sim \operatorname{HalfNormal}$",
r"$\text{mu} \sim \operatorname{Deterministic}$",
- r"$\text{beta} \sim \operatorname{N}$",
- r"$\text{Z} \sim \operatorname{N}$",
- r"$\text{nb_with_p_n} \sim \operatorname{NB}$",
+ r"$\text{beta} \sim \operatorname{Normal}$",
+ r"$\text{Z} \sim \operatorname{MultivariateNormal}$",
+ r"$\text{nb_with_p_n} \sim \operatorname{NegativeBinomial}$",
r"$\text{zip} \sim \operatorname{MarginalMixture}$",
- r"$\text{w} \sim \operatorname{Dir}$",
+ r"$\text{w} \sim \operatorname{Dirichlet}$",
r"$\text{nested_mix} \sim \operatorname{MarginalMixture}$",
- r"$\text{Y_obs} \sim \operatorname{N}$",
+ r"$\text{Y_obs} \sim \operatorname{Normal}$",
r"$\text{pot} \sim \operatorname{Potential}$",
],
}
@@ -210,28 +210,28 @@ def setup_class(self):
self.formats = [("plain", True), ("plain", False), ("latex", True), ("latex", False)]
self.expected = {
("plain", True): [
- r"a ~ N(2, 1)",
- r"b ~ N(, 1)",
- r"c ~ N(2, 1)",
- r"d ~ N(, 1)",
+ r"a ~ Normal(2, 1)",
+ r"b ~ Normal(, 1)",
+ r"c ~ Normal(2, 1)",
+ r"d ~ Normal(, 1)",
],
("plain", False): [
- r"a ~ N",
- r"b ~ N",
- r"c ~ N",
- r"d ~ N",
+ r"a ~ Normal",
+ r"b ~ Normal",
+ r"c ~ Normal",
+ r"d ~ Normal",
],
("latex", True): [
- r"$\text{a} \sim \operatorname{N}(2,~1)$",
- r"$\text{b} \sim \operatorname{N}(\text{},~1)$",
- r"$\text{c} \sim \operatorname{N}(2,~1)$",
- r"$\text{d} \sim \operatorname{N}(\text{},~1)$",
+ r"$\text{a} \sim \operatorname{Normal}(2,~1)$",
+ r"$\text{b} \sim \operatorname{Normal}(\text{},~1)$",
+ r"$\text{c} \sim \operatorname{Normal}(2,~1)$",
+ r"$\text{d} \sim \operatorname{Normal}(\text{},~1)$",
],
("latex", False): [
- r"$\text{a} \sim \operatorname{N}$",
- r"$\text{b} \sim \operatorname{N}$",
- r"$\text{c} \sim \operatorname{N}$",
- r"$\text{d} \sim \operatorname{N}$",
+ r"$\text{a} \sim \operatorname{Normal}$",
+ r"$\text{b} \sim \operatorname{Normal}$",
+ r"$\text{c} \sim \operatorname{Normal}$",
+ r"$\text{d} \sim \operatorname{Normal}$",
],
}
@@ -249,9 +249,9 @@ def test_model_latex_repr_three_levels_model():
expected = [
"$$",
"\\begin{array}{rcl}",
- "\\text{mu} &\\sim & \\operatorname{N}(0,~5)\\\\\\text{sigma} &\\sim & "
- "\\operatorname{C^{+}}(0,~2.5)\\\\\\text{censored_normal} &\\sim & "
- "\\operatorname{Censored}(\\operatorname{N}(\\text{mu},~\\text{sigma}),~-2,~2)",
+ "\\text{mu} &\\sim & \\operatorname{Normal}(0,~5)\\\\\\text{sigma} &\\sim & "
+ "\\operatorname{HalfCauchy}(0,~2.5)\\\\\\text{censored_normal} &\\sim & "
+ "\\operatorname{Censored}(\\operatorname{Normal}(\\text{mu},~\\text{sigma}),~-2,~2)",
"\\end{array}",
"$$",
]
@@ -268,8 +268,8 @@ def test_model_latex_repr_mixture_model():
"$$",
"\\begin{array}{rcl}",
"\\text{w} &\\sim & "
- "\\operatorname{Dir}(\\text{})\\\\\\text{mix} &\\sim & "
- "\\operatorname{MarginalMixture}(\\text{w},~\\operatorname{N}(0,~5),~\\operatorname{StudentT}(7,~0,~1))",
+ "\\operatorname{Dirichlet}(\\text{})\\\\\\text{mix} &\\sim & "
+ "\\operatorname{MarginalMixture}(\\text{w},~\\operatorname{Normal}(0,~5),~\\operatorname{StudentT}(7,~0,~1))",
"\\end{array}",
"$$",
]
diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py
index 3fe9440be3..009eb88619 100644
--- a/tests/test_pytensorf.py
+++ b/tests/test_pytensorf.py
@@ -11,6 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import warnings
+
from unittest import mock
import numpy as np
@@ -18,7 +20,7 @@
import numpy.testing as npt
import pandas as pd
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import scipy.sparse as sps
@@ -38,6 +40,7 @@
from pymc.exceptions import NotConstantValueError
from pymc.logprob.utils import ParameterValueError
from pymc.pytensorf import (
+ collect_default_updates,
compile_pymc,
constant_fold,
convert_observed_data,
@@ -48,8 +51,8 @@
rvs_to_value_vars,
walk_model,
)
+from pymc.testing import assert_no_rvs
from pymc.vartypes import int_types
-from tests.helpers import assert_no_rvs
@pytest.mark.parametrize(
@@ -62,7 +65,7 @@
)
def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
df = pd.DataFrame(np_array)
- np.testing.assert_array_equal(x=at.as_tensor_variable(x=df).eval(), y=np_array)
+ np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
@pytest.mark.parametrize(
@@ -71,7 +74,7 @@ def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
)
def test_pd_series_as_tensor_variable(np_array: np.ndarray) -> None:
df = pd.Series(np_array)
- np.testing.assert_array_equal(x=at.as_tensor_variable(x=df).eval(), y=np_array)
+ np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
def test_pd_as_tensor_variable_multiindex() -> None:
@@ -82,7 +85,7 @@ def test_pd_as_tensor_variable_multiindex() -> None:
df = pd.DataFrame({"A": [12.0, 80.0, 30.0, 20.0], "B": [120.0, 700.0, 30.0, 20.0]}, index=index)
np_array = np.array([[12.0, 80.0, 30.0, 20.0], [120.0, 700.0, 30.0, 20.0]]).T
assert isinstance(df.index, pd.MultiIndex)
- np.testing.assert_array_equal(x=at.as_tensor_variable(x=df).eval(), y=np_array)
+ np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
class TestBroadcasting:
@@ -135,10 +138,10 @@ def _make_along_axis_idx(arr_shape, indices, axis):
def test_extract_obs_data():
with pytest.raises(TypeError):
- extract_obs_data(at.matrix())
+ extract_obs_data(pt.matrix())
data = np.random.normal(size=(2, 3))
- data_at = at.as_tensor(data)
+ data_at = pt.as_tensor(data)
mask = np.random.binomial(1, 0.5, size=(2, 3)).astype(bool)
for val_at in (data_at, pytensor.shared(data)):
@@ -150,8 +153,8 @@ def test_extract_obs_data():
# AdvancedIncSubtensor check
data_m = np.ma.MaskedArray(data, mask)
missing_values = data_at.type()[mask]
- constant = at.as_tensor(data_m.filled())
- z_at = at.set_subtensor(constant[mask.nonzero()], missing_values)
+ constant = pt.as_tensor(data_m.filled())
+ z_at = pt.set_subtensor(constant[mask.nonzero()], missing_values)
assert isinstance(z_at.owner.op, (AdvancedIncSubtensor, AdvancedIncSubtensor1))
@@ -162,13 +165,13 @@ def test_extract_obs_data():
# AdvancedIncSubtensor1 check
data = np.random.normal(size=(3,))
- data_at = at.as_tensor(data)
+ data_at = pt.as_tensor(data)
mask = np.random.binomial(1, 0.5, size=(3,)).astype(bool)
data_m = np.ma.MaskedArray(data, mask)
missing_values = data_at.type()[mask]
- constant = at.as_tensor(data_m.filled())
- z_at = at.set_subtensor(constant[mask.nonzero()], missing_values)
+ constant = pt.as_tensor(data_m.filled())
+ z_at = pt.set_subtensor(constant[mask.nonzero()], missing_values)
assert isinstance(z_at.owner.op, (AdvancedIncSubtensor, AdvancedIncSubtensor1))
@@ -179,7 +182,7 @@ def test_extract_obs_data():
# Cast check
data = np.array(5)
- t = at.cast(at.as_tensor(5.0), np.int64)
+ t = pt.cast(pt.as_tensor(5.0), np.int64)
res = extract_obs_data(t)
assert isinstance(res, np.ndarray)
@@ -197,7 +200,7 @@ def test_convert_observed_data(input_dtype):
dense_input = np.arange(9).reshape((3, 3)).astype(input_dtype)
input_name = "input_variable"
- pytensor_graph_input = at.as_tensor(dense_input, name=input_name)
+ pytensor_graph_input = pt.as_tensor(dense_input, name=input_name)
pandas_input = pd.DataFrame(dense_input)
# All the even numbers are replaced with NaN
@@ -274,14 +277,14 @@ def test_pandas_to_array_pandas_index():
def test_walk_model():
- a = at.vector("a")
+ a = pt.vector("a")
b = uniform(0.0, a, name="b")
- c = at.log(b)
+ c = pt.log(b)
c.name = "c"
- d = at.vector("d")
+ d = pt.vector("d")
e = normal(c, d, name="e")
- test_graph = at.exp(e + 1)
+ test_graph = pt.exp(e + 1)
res = list(walk_model((test_graph,)))
assert a in res
@@ -308,7 +311,7 @@ def test_walk_model():
class TestCompilePyMC:
def test_check_bounds_flag(self):
"""Test that CheckParameterValue Ops are replaced or removed when using compile_pymc"""
- logp = at.ones(3)
+ logp = pt.ones(3)
cond = np.array([1, 0, 1])
bound = check_parameters(logp, cond)
@@ -326,6 +329,21 @@ def test_check_bounds_flag(self):
with m:
assert np.all(compile_pymc([], bound)() == -np.inf)
+ def test_check_parameters_can_be_replaced_by_ninf(self):
+ expr = pt.vector("expr", shape=(3,))
+ cond = pt.ge(expr, 0)
+
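+        # With can_be_replaced_by_ninf=True a failed check should yield -inf; otherwise it raises ParameterValueError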
+ final_expr = check_parameters(expr, cond, can_be_replaced_by_ninf=True)
+ fn = compile_pymc([expr], final_expr)
+ np.testing.assert_array_equal(fn(expr=[1, 2, 3]), [1, 2, 3])
+ np.testing.assert_array_equal(fn(expr=[-1, 2, 3]), [-np.inf, -np.inf, -np.inf])
+
+ final_expr = check_parameters(expr, cond, msg="test", can_be_replaced_by_ninf=False)
+ fn = compile_pymc([expr], final_expr)
+ np.testing.assert_array_equal(fn(expr=[1, 2, 3]), [1, 2, 3])
+ with pytest.raises(ParameterValueError, match="test"):
+ fn([-1, 2, 3])
+
def test_compile_pymc_sets_rng_updates(self):
rng = pytensor.shared(np.random.default_rng(0))
x = pm.Normal.dist(rng=rng)
@@ -370,9 +388,9 @@ def test_compile_pymc_updates_inputs(self):
"""Test that compile_pymc does not include rngs updates of variables that are inputs
or ancestors to inputs
"""
- x = at.random.normal()
- y = at.random.normal(x)
- z = at.random.normal(y)
+ x = pt.random.normal()
+ y = pt.random.normal(x)
+ z = pt.random.normal(y)
for inputs, rvs_in_graph in (
([], 3),
@@ -391,34 +409,69 @@ def test_compile_pymc_updates_inputs(self):
# Each RV adds a shared output for its rng
assert len(fn_fgraph.outputs) == 1 + rvs_in_graph
- # Disable `reseed_rngs` so that we can test with simpler update rule
- @mock.patch("pymc.pytensorf.reseed_rngs")
- def test_compile_pymc_custom_update_op(self, _):
- """Test that custom MeasurableVariable Op updates are used by compile_pymc"""
+ def test_compile_pymc_symbolic_rv_update(self):
+ """Test that SymbolicRandomVariable Op update methods are used by compile_pymc"""
class NonSymbolicRV(OpFromGraph):
def update(self, node):
- return {node.inputs[0]: node.inputs[0] + 1}
+ return {node.inputs[0]: node.outputs[0]}
- dummy_inputs = [at.scalar(), at.scalar()]
- dummy_outputs = [at.add(*dummy_inputs)]
- dummy_x = NonSymbolicRV(dummy_inputs, dummy_outputs)(pytensor.shared(1.0), 1.0)
+ rng = pytensor.shared(np.random.default_rng())
+ dummy_rng = rng.type()
+ dummy_next_rng, dummy_x = NonSymbolicRV(
+ [dummy_rng], pt.random.normal(rng=dummy_rng).owner.outputs
+ )(rng)
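+        # dummy_next_rng is the updated RNG state and dummy_x the draw produced by the wrapped normal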
# Check that there are no updates at first
fn = compile_pymc(inputs=[], outputs=dummy_x)
- assert fn() == fn() == 2.0
+ assert fn() == fn()
# And they are enabled once the Op is registered as a SymbolicRV
SymbolicRandomVariable.register(NonSymbolicRV)
- fn = compile_pymc(inputs=[], outputs=dummy_x)
- assert fn() == 2.0
- assert fn() == 3.0
+ fn = compile_pymc(inputs=[], outputs=dummy_x, random_seed=431)
+ assert fn() != fn()
+
+ def test_compile_pymc_symbolic_rv_missing_update(self):
+ """Test that error is raised if SymbolicRandomVariable Op does not
+ provide rule for updating RNG"""
+
+ class SymbolicRV(OpFromGraph):
+ def update(self, node):
+ # Update is provided for rng1 but not rng2
+ return {node.inputs[0]: node.outputs[0]}
+
+ SymbolicRandomVariable.register(SymbolicRV)
+
+ # No problems at first, as the one RNG is given the update rule
+ rng1 = pytensor.shared(np.random.default_rng())
+ dummy_rng1 = rng1.type()
+ dummy_next_rng1, dummy_x1 = SymbolicRV(
+ [dummy_rng1],
+ pt.random.normal(rng=dummy_rng1).owner.outputs,
+ )(rng1)
+ fn = compile_pymc(inputs=[], outputs=dummy_x1, random_seed=433)
+ assert fn() != fn()
+
+ # Now there's a problem as there is no update rule for rng2
+ rng2 = pytensor.shared(np.random.default_rng())
+ dummy_rng2 = rng2.type()
+ dummy_next_rng1, dummy_x1, dummy_next_rng2, dummy_x2 = SymbolicRV(
+ [dummy_rng1, dummy_rng2],
+ [
+ *pt.random.normal(rng=dummy_rng1).owner.outputs,
+ *pt.random.normal(rng=dummy_rng2).owner.outputs,
+ ],
+ )(rng1, rng2)
+ with pytest.raises(
+ ValueError, match="No update mapping found for RNG used in SymbolicRandomVariable"
+ ):
+ compile_pymc(inputs=[], outputs=[dummy_x1, dummy_x2])
def test_random_seed(self):
seedx = pytensor.shared(np.random.default_rng(1))
seedy = pytensor.shared(np.random.default_rng(1))
- x = at.random.normal(rng=seedx)
- y = at.random.normal(rng=seedy)
+ x = pt.random.normal(rng=seedx)
+ y = pt.random.normal(rng=seedy)
# Shared variables are the same, so outputs will be identical
f0 = pytensor.function([], [x, y])
@@ -442,20 +495,67 @@ def test_random_seed(self):
assert y3_eval == y2_eval
def test_multiple_updates_same_variable(self):
- rng = pytensor.shared(np.random.default_rng(), name="rng")
- x = at.random.normal(rng=rng)
- y = at.random.normal(rng=rng)
+ # Raise if unexpected warning is issued
+ with warnings.catch_warnings():
+ warnings.simplefilter("error")
+
+ rng = pytensor.shared(np.random.default_rng(), name="rng")
+ x = pt.random.normal(rng=rng)
+ y = pt.random.normal(rng=rng)
+
+ # No warnings if only one variable is used
+ assert compile_pymc([], [x])
+ assert compile_pymc([], [y])
+
+ user_warn_msg = "RNG Variable rng has multiple clients"
+ with pytest.warns(UserWarning, match=user_warn_msg):
+ f = compile_pymc([], [x, y], random_seed=456)
+ assert f() == f()
+
+ # The user can provide an explicit update, but we will still issue a warning
+ with pytest.warns(UserWarning, match=user_warn_msg):
+ f = compile_pymc([], [x, y], updates={rng: y.owner.outputs[0]}, random_seed=456)
+ assert f() != f()
+
+ # Same with default update
+ rng.default_update = x.owner.outputs[0]
+ with pytest.warns(UserWarning, match=user_warn_msg):
+ f = compile_pymc([], [x, y], updates={rng: y.owner.outputs[0]}, random_seed=456)
+ assert f() != f()
+
+ def test_nested_updates(self):
+ rng = pytensor.shared(np.random.default_rng())
+ next_rng1, x = pt.random.normal(rng=rng).owner.outputs
+ next_rng2, y = pt.random.normal(rng=next_rng1).owner.outputs
+ next_rng3, z = pt.random.normal(rng=next_rng2).owner.outputs
+
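+        # Chained draws: each normal consumes the previous next-RNG, so the default update for rng is the final next_rng3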
+        assert collect_default_updates([], [x, y, z]) == {rng: next_rng3}
+
+ fn = compile_pymc([], [x, y, z], random_seed=514)
+ assert not set(list(np.array(fn()))) & set(list(np.array(fn())))
+
+        # A local, myopic rule (as PyMC used before) would not work properly
+ fn = pytensor.function([], [x, y, z], updates={rng: next_rng1})
+ assert set(list(np.array(fn()))) & set(list(np.array(fn())))
+
+
+def test_collect_default_updates_must_be_shared():
+ shared_rng = pytensor.shared(np.random.default_rng())
+ nonshared_rng = shared_rng.type()
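+    # shared_rng.type() creates a plain (non-shared) variable of the same RNG type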
+
+ next_rng_of_shared, x = pt.random.normal(rng=shared_rng).owner.outputs
+ next_rng_of_nonshared, y = pt.random.normal(rng=nonshared_rng).owner.outputs
+
+ res = collect_default_updates(inputs=[nonshared_rng], outputs=[x, y])
+ assert res == {shared_rng: next_rng_of_shared}
- assert compile_pymc([], [x])
- assert compile_pymc([], [y])
- msg = "Multiple update expressions found for the variable rng"
- with pytest.raises(ValueError, match=msg):
- compile_pymc([], [x, y])
+ res = collect_default_updates(inputs=[nonshared_rng], outputs=[x, y], must_be_shared=False)
+ assert res == {shared_rng: next_rng_of_shared, nonshared_rng: next_rng_of_nonshared}
def test_replace_rng_nodes():
rng = pytensor.shared(np.random.default_rng())
- x = at.random.normal(rng=rng)
+ x = pt.random.normal(rng=rng)
x_rng, *x_non_rng_inputs = x.owner.inputs
cloned_x = x.owner.clone().default_output()
@@ -511,8 +611,8 @@ def test_reseed_rngs():
def test_constant_fold():
- x = at.random.normal(size=(5,))
- y = at.arange(x.size)
+ x = pt.random.normal(size=(5,))
+ y = pt.arange(x.size)
res = constant_fold((y, y.shape))
assert np.array_equal(res[0], np.arange(5))
@@ -521,8 +621,8 @@ def test_constant_fold():
def test_constant_fold_raises():
size = pytensor.shared(5)
- x = at.random.normal(size=(size,))
- y = at.arange(x.size)
+ x = pt.random.normal(size=(size,))
+ y = pt.arange(x.size)
with pytest.raises(NotConstantValueError):
constant_fold((y, y.shape))
@@ -551,7 +651,7 @@ def test_basic(self, symbolic_rv, apply_transforms, test_deprecated_fn):
else:
b = pm.Uniform("b", 0, a + 1.0, transform=interval)
c = pm.Normal("c")
- d = at.log(c + b) + 2.0
+ d = pt.log(c + b) + 2.0
a_value_var = m.rvs_to_values[a]
assert m.rvs_to_transforms[a] is not None
@@ -569,11 +669,11 @@ def test_basic(self, symbolic_rv, apply_transforms, test_deprecated_fn):
rvs_to_transforms=m.rvs_to_transforms,
)
- assert res.owner.op == at.add
+ assert res.owner.op == pt.add
log_output = res.owner.inputs[0]
- assert log_output.owner.op == at.log
+ assert log_output.owner.op == pt.log
log_add_output = res.owner.inputs[0].owner.inputs[0]
- assert log_add_output.owner.op == at.add
+ assert log_add_output.owner.op == pt.add
c_output = log_add_output.owner.inputs[0]
# We make sure that the random variables were replaced
@@ -624,12 +724,12 @@ def test_unvalued_rv(self, test_deprecated_fn):
rvs_to_transforms=m.rvs_to_transforms,
)
- assert res.owner.op == at.add
+ assert res.owner.op == pt.add
assert res.owner.inputs[0] is z_value
res_y = res.owner.inputs[1]
# Graph should have be cloned, and therefore y and res_y should have different ids
assert res_y is not y
- assert res_y.owner.op == at.random.normal
+ assert res_y.owner.op == pt.random.normal
assert res_y.owner.inputs[3] is x_value
@pytest.mark.parametrize("test_deprecated_fn", (True, False))
@@ -638,7 +738,7 @@ def test_no_change_inplace(self, test_deprecated_fn):
# does not change the original rvs in place. See issue #5172
with pm.Model() as m:
one = pm.LogNormal("one", mu=0)
- two = pm.LogNormal("two", mu=at.log(one))
+ two = pm.LogNormal("two", mu=pt.log(one))
# We add potentials or deterministics that are not in topological order
pm.Potential("two_pot", two)
diff --git a/tests/test_testing.py b/tests/test_testing.py
new file mode 100644
index 0000000000..b23e97a1d2
--- /dev/null
+++ b/tests/test_testing.py
@@ -0,0 +1,34 @@
+# Copyright 2023 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from contextlib import ExitStack as does_not_raise
+
+import pytest
+
+from pymc.testing import Domain
+
+
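+# When edges is None, Domain uses the first and last values as the edges, so at
+# least three values are needed; passing edges explicitly allows shorter value lists.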
+@pytest.mark.parametrize(
+ "values, edges, expectation",
+ [
+ ([], None, pytest.raises(IndexError)),
+ ([], (0, 0), pytest.raises(ValueError)),
+ ([0], None, pytest.raises(ValueError)),
+ ([0], (0, 0), does_not_raise()),
+ ([-1, 1], None, pytest.raises(ValueError)),
+ ([-1, 0, 1], None, does_not_raise()),
+ ],
+)
+def test_domain(values, edges, expectation):
+ with expectation:
+ Domain(values, edges=edges)
diff --git a/tests/tuning/test_starting.py b/tests/tuning/test_starting.py
index cdff83a22e..4e7a3540ee 100644
--- a/tests/tuning/test_starting.py
+++ b/tests/tuning/test_starting.py
@@ -20,10 +20,10 @@
from pymc.exceptions import ImputationWarning
from pymc.step_methods.metropolis import tune
+from pymc.testing import select_by_precision
from pymc.tuning import find_MAP
from tests import models
from tests.checks import close_to
-from tests.helpers import select_by_precision
from tests.models import non_normal, simple_arbitrary_det, simple_model
diff --git a/tests/variational/test_inference.py b/tests/variational/test_inference.py
index 62d452f0de..f23d7cc4f7 100644
--- a/tests/variational/test_inference.py
+++ b/tests/variational/test_inference.py
@@ -18,7 +18,7 @@
import cloudpickle
import numpy as np
import pytensor
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import pymc as pm
@@ -287,7 +287,7 @@ def test_replacements(binomial_model_inference):
def test_sample_replacements(binomial_model_inference):
- i = at.iscalar()
+ i = pt.iscalar()
i.tag.test_value = 1
approx = binomial_model_inference.approx
p = approx.model.p
diff --git a/tests/variational/test_minibatch_rv.py b/tests/variational/test_minibatch_rv.py
index 7f0a1d4dc4..8246c16ca3 100644
--- a/tests/variational/test_minibatch_rv.py
+++ b/tests/variational/test_minibatch_rv.py
@@ -20,8 +20,8 @@
import pymc as pm
from pymc import Normal, draw
+from pymc.testing import select_by_precision
from pymc.variational.minibatch_rv import create_minibatch_rv
-from tests.helpers import select_by_precision
from tests.test_data import gen1, gen2
diff --git a/tests/variational/test_opvi.py b/tests/variational/test_opvi.py
index af75a21a8f..84214197a4 100644
--- a/tests/variational/test_opvi.py
+++ b/tests/variational/test_opvi.py
@@ -15,7 +15,7 @@
import functools as ft
import numpy as np
-import pytensor.tensor as at
+import pytensor.tensor as pt
import pytest
import pymc as pm
@@ -39,7 +39,7 @@ def test_discrete_not_allowed():
with pm.Model():
mu = pm.Normal("mu", mu=0, sigma=10, size=3)
- z = pm.Categorical("z", p=at.ones(3) / 3, size=len(y))
+ z = pm.Categorical("z", p=pt.ones(3) / 3, size=len(y))
pm.Normal("y_obs", mu=mu[z], sigma=1.0, observed=y)
with pytest.raises(opvi.ParametrizationError, match="Discrete variables"):
pm.fit(n=1) # fails