Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Changelog
IN PROGRESS
--------------------

* Add frequentist coverage interval support in the ``intervals`` module.
`#176 <https://github.com/scikit-hep/hist/pull/176>`_

* Allow ``plot_pull`` to take a more generic callable or a string as a fitting function.
Introduce an option to perform a likelihood fit. Write fit parameters' values
and uncertainties in the legend.
Expand Down
8 changes: 8 additions & 0 deletions docs/reference/hist.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ hist.hist module
:undoc-members:
:show-inheritance:

hist.intervals module
---------------------

.. automodule:: hist.intervals
:members:
:undoc-members:
:show-inheritance:

hist.namedhist module
---------------------

Expand Down
1 change: 0 additions & 1 deletion src/hist/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from .namedhist import NamedHist
from .tag import loc, overflow, rebin, sum, underflow

# Convenient access to the version number
# Convenient access to the version number
from .version import version as __version__

Expand Down
145 changes: 145 additions & 0 deletions src/hist/intervals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from typing import Any, Optional, Tuple

import numpy as np

from .typing import Literal

try:
from scipy import stats
except ModuleNotFoundError:
from sys import stderr

print(
"hist.intervals requires scipy. Please install hist[plot] or manually install scipy.",
file=stderr,
)
raise

# Public API of this module; keeps `dir()` and star-imports limited to the
# interval helpers.
__all__ = ("poisson_interval", "clopper_pearson_interval", "ratio_uncertainty")


def __dir__() -> Tuple[str, ...]:
    # PEP 562 module __dir__: restrict tab-completion/dir() to the public API.
    return __all__


def poisson_interval(
    values: np.ndarray, variances: np.ndarray, coverage: "Optional[float]" = None
) -> np.ndarray:
    r"""
    The Frequentist coverage interval for Poisson-distributed observations.

    What is calculated is the "Garwood" interval,
    c.f. https://www.ine.pt/revstat/pdf/rs120203.pdf or
    http://ms.mcmaster.ca/peter/s743/poissonalpha.html.
    For weighted data, this approximates the observed count by
    ``values**2/variances``, which effectively scales the unweighted
    Poisson interval by the average weight.
    This may not be the optimal solution: see https://arxiv.org/abs/1309.1287
    for a proper treatment.

    When a bin is zero, the scale of the nearest nonzero bin is substituted to
    scale the nominal upper bound.

    Args:
        values: Sum of weights.
        variances: Sum of weights squared.
        coverage: Central coverage interval.
            Default is one standard deviation, which is roughly ``0.68``.

    Raises:
        RuntimeError: If all ``values`` are zero, so no scale can be inferred
            for any bin.

    Returns:
        The Poisson central coverage interval.
    """
    # Parts originally contributed to coffea
    # https://github.com/CoffeaTeam/coffea/blob/8c58807e199a7694bf15e3803dbaf706d34bbfa0/LICENSE
    if coverage is None:
        # One-sigma central coverage (~0.6827).
        coverage = stats.norm.cdf(1) - stats.norm.cdf(-1)
    # Per-bin average weight; dividing by it recovers the effective
    # unweighted count values**2 / variances.
    scale = np.empty_like(values)
    nonzero = values != 0
    scale[nonzero] = variances[nonzero] / values[nonzero]
    if np.sum(values == 0) > 0:
        missing = np.where(values == 0)
        available = np.nonzero(values)
        if len(available[0]) == 0:
            raise RuntimeError(
                "All values are zero! Cannot compute meaningful uncertainties.",
            )
        # For each empty bin, borrow the scale of the nearest (in index
        # space) nonzero bin.
        # BUGFIX: sum the per-dimension squared index distances with axis=0.
        # Without it, np.sum collapses the whole list to a scalar, so every
        # empty bin would be matched to the first nonzero bin instead of the
        # nearest one.
        nearest = np.sum(
            [np.square(np.subtract.outer(d, d0)) for d, d0 in zip(available, missing)],
            axis=0,
        ).argmin(axis=0)
        argnearest = tuple(dim[nearest] for dim in available)
        scale[missing] = scale[argnearest]
    counts = values / scale
    # Garwood interval endpoints from chi-squared quantiles.
    interval_min = scale * stats.chi2.ppf((1 - coverage) / 2, 2 * counts) / 2.0
    interval_max = scale * stats.chi2.ppf((1 + coverage) / 2, 2 * (counts + 1)) / 2.0
    interval = np.stack((interval_min, interval_max))
    # BUGFIX: `interval == np.nan` is always False (NaN compares unequal to
    # everything, including itself), so the NaNs chi2.ppf produces for zero
    # counts were never zeroed out. Use np.isnan instead.
    interval[np.isnan(interval)] = 0.0  # chi2.ppf produces NaN for counts=0
    return interval


def clopper_pearson_interval(
    num: np.ndarray, denom: np.ndarray, coverage: "Optional[float]" = None
) -> np.ndarray:
    r"""
    Compute the Clopper-Pearson coverage interval for a binomial distribution.
    c.f. http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval

    Args:
        num: Numerator or number of successes.
        denom: Denominator or number of trials.
        coverage: Central coverage interval.
            Default is one standard deviation, which is roughly ``0.68``.

    Returns:
        The Clopper-Pearson central coverage interval.
    """
    # Parts originally contributed to coffea
    # https://github.com/CoffeaTeam/coffea/blob/8c58807e199a7694bf15e3803dbaf706d34bbfa0/LICENSE
    if coverage is None:
        # One-sigma central coverage (~0.6827).
        coverage = stats.norm.cdf(1) - stats.norm.cdf(-1)
    # Successes must not exceed trials.
    if np.any(num > denom):
        raise ValueError(
            "Found numerator larger than denominator while calculating binomial uncertainty"
        )
    # Exact binomial bounds from the beta-distribution quantiles.
    lower = stats.beta.ppf((1 - coverage) / 2, num, denom - num + 1)
    upper = stats.beta.ppf((1 + coverage) / 2, num + 1, denom - num)
    interval = np.stack((lower, upper))
    # Edge cases: zero successes pins both bounds at 0; all successes pins
    # the upper bound at 1 (beta.ppf yields NaN there).
    zero_successes = num == 0.0
    all_successes = num == denom
    interval[:, zero_successes] = 0.0
    interval[1, all_successes] = 1.0
    return interval


def ratio_uncertainty(
    num: np.ndarray,
    denom: np.ndarray,
    uncertainty_type: Literal["poisson", "poisson-ratio"] = "poisson",
) -> Any:
    r"""
    Calculate the uncertainties for the values of the ratio ``num/denom`` using
    the specified coverage interval approach.

    Args:
        num: Numerator or number of successes.
        denom: Denominator or number of trials.
        uncertainty_type: Coverage interval type to use in the calculation of
            the uncertainties.
            ``"poisson"`` (default) implements the Garwood interval for the
            Poisson-distributed numerator, scaled by the denominator.
            ``"poisson-ratio"`` implements the Clopper-Pearson interval for Poisson
            distributed ``num`` and ``denom``.

    Raises:
        TypeError: If ``uncertainty_type`` is not one of the supported options.

    Returns:
        The uncertainties for the ratio.
    """
    # Note: As return is a numpy ufuncs the type is "Any"
    # Zero denominators would otherwise emit divide warnings; the resulting
    # inf/NaN entries are left to the caller to interpret.
    with np.errstate(divide="ignore"):
        ratio = num / denom
        if uncertainty_type == "poisson":
            # Poisson interval of the numerator only, rescaled by the
            # denominator: ratio has variance num / denom**2.
            ratio_uncert = np.abs(
                poisson_interval(ratio, num / np.square(denom)) - ratio
            )
        elif uncertainty_type == "poisson-ratio":
            # A Poisson ratio n/m is equivalent to a binomial fraction n/(n+m).
            ratio_uncert = np.abs(clopper_pearson_interval(num, num + denom) - ratio)
        else:
            raise TypeError(
                f"'{uncertainty_type}' is an invalid option for uncertainty_type."
            )
    return ratio_uncert
2 changes: 1 addition & 1 deletion src/hist/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def plot_pull(
try:
from iminuit import Minuit # noqa: F401
from scipy.optimize import curve_fit # noqa: F401
except ImportError:
except ModuleNotFoundError:
print(
"Hist.plot_pull requires scipy and iminuit. Please install hist[plot] or manually install dependencies.",
file=sys.stderr,
Expand Down
6 changes: 3 additions & 3 deletions src/hist/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from typing import TYPE_CHECKING, Any, Tuple

if sys.version_info < (3, 8):
from typing_extensions import Protocol, SupportsIndex
from typing_extensions import Literal, Protocol, SupportsIndex
else:
from typing import Protocol, SupportsIndex
from typing import Literal, Protocol, SupportsIndex

if TYPE_CHECKING:
from numpy import ufunc as Ufunc
Expand All @@ -14,7 +14,7 @@
Ufunc = Any


__all__ = ("Protocol", "SupportsIndex", "Ufunc", "ArrayLike")
__all__ = ("Literal", "Protocol", "SupportsIndex", "Ufunc", "ArrayLike")


def __dir__() -> Tuple[str, ...]:
Expand Down
141 changes: 141 additions & 0 deletions tests/test_intervals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import numpy as np
import pytest
from pytest import approx

from hist import Hist, axis, intervals


@pytest.fixture(scope="session")
def hist_fixture():
    """Two 1D histograms of standard-normal samples (1000 and 1700 entries)."""
    np.random.seed(42)

    def _make_hist(n_samples):
        # One regular axis over [-3, 3) with no flow bins, filled with
        # draws from the (seeded) standard normal.
        h = Hist(
            axis.Regular(
                10, -3, 3, name="X", label="x [units]", underflow=False, overflow=False
            )
        )
        return h.fill(np.random.normal(size=n_samples))

    # Draw order matters for reproducibility: hist_1 first, then hist_2.
    return _make_hist(1000), _make_hist(1700)


def test_poisson_interval(hist_fixture):
    """Garwood Poisson interval matches pinned reference values."""
    hist_1, hist_2 = hist_fixture

    expected_min = np.array(
        [
            1.5300291697782136,
            15.163319280895097,
            70.76628228209115,
            138.53885398885032,
            206.76205061547802,
            216.04895966121967,
            146.23699499970348,
            59.34874385941129,
            23.42140089960769,
            4.398298999080774,
        ]
    )
    expected_max = np.array(
        [
            12.599700199673102,
            28.738493673101413,
            94.88918823365604,
            173.30954997605485,
            246.94963052163627,
            257.713403993323,
            181.58237748187338,
            84.74029590412256,
            38.20780361508876,
            13.75903516119368,
        ]
    )

    interval_min, interval_max = intervals.poisson_interval(
        hist_1.values(), hist_2.values()
    )

    assert approx(interval_min) == expected_min
    assert approx(interval_max) == expected_max


def test_clopper_pearson_interval(hist_fixture):
    """Clopper-Pearson interval for an 80% efficiency matches pinned values."""
    hist_1, _ = hist_fixture

    expected_min = np.array(
        [
            0.4757493739959414,
            0.6739341864914268,
            0.745848569184471,
            0.7626112365367469,
            0.769793269861182,
            0.7705165046817444,
            0.7636693080808218,
            0.7409965720106119,
            0.6996942106437167,
            0.5617068220945686,
        ]
    )
    expected_max = np.array(
        [
            0.9660393063397832,
            0.8909499770371633,
            0.8458913477820733,
            0.8331466664953922,
            0.8273390876155183,
            0.8267415410793807,
            0.832305086850061,
            0.8493900886167576,
            0.8763181169809199,
            0.9404385982116935,
        ]
    )

    # Numerator is a scaled subset (80%) of the denominator.
    interval_min, interval_max = intervals.clopper_pearson_interval(
        hist_1.values() * 0.8, hist_1.values()
    )

    assert approx(interval_min) == expected_min
    assert approx(interval_max) == expected_max


def test_ratio_uncertainty(hist_fixture):
    """Poisson ratio uncertainties match pinned values; bad type raises."""
    hist_1, hist_2 = hist_fixture

    expected_min = np.array(
        [
            0.1439794096271186,
            0.12988019998066708,
            0.0711565635066328,
            0.045722288708959336,
            0.04049103990124614,
            0.038474711321686006,
            0.045227104349518155,
            0.06135954973309016,
            0.12378460125991042,
            0.19774186117590858,
        ]
    )
    expected_max = np.array(
        [
            0.22549817680979262,
            0.1615766277480729,
            0.07946632561746425,
            0.04954668134626106,
            0.04327624938437291,
            0.04106267733757407,
            0.04891233040201837,
            0.06909296140898324,
            0.1485919630151803,
            0.2817958228477908,
        ]
    )

    uncertainty_min, uncertainty_max = intervals.ratio_uncertainty(
        hist_1.values(), hist_2.values(), uncertainty_type="poisson"
    )

    assert approx(uncertainty_min) == expected_min
    assert approx(uncertainty_max) == expected_max

    # An unrecognized uncertainty_type must be rejected.
    with pytest.raises(TypeError):
        intervals.ratio_uncertainty(
            hist_1.values(), hist_2.values(), uncertainty_type="fail"
        )