diff --git a/docs/changelog.rst b/docs/changelog.rst index 920fdb97..c42f2204 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,6 +4,9 @@ Changelog IN PROGRESS -------------------- +* Add frequentist coverage interval support in the ``intervals`` module. + `#176 `_ + * Allow ``plot_pull`` to take a more generic callable or a string as a fitting function. Introduce an option to perform a likelihood fit. Write fit parameters' values and uncertainties in the legend. diff --git a/docs/reference/hist.rst b/docs/reference/hist.rst index 1892b6fc..4ae78a58 100644 --- a/docs/reference/hist.rst +++ b/docs/reference/hist.rst @@ -52,6 +52,14 @@ hist.hist module :undoc-members: :show-inheritance: +hist.intervals module +--------------------- + +.. automodule:: hist.intervals + :members: + :undoc-members: + :show-inheritance: + hist.namedhist module --------------------- diff --git a/src/hist/__init__.py b/src/hist/__init__.py index 8b9a96a5..21683fd5 100644 --- a/src/hist/__init__.py +++ b/src/hist/__init__.py @@ -13,7 +13,6 @@ from .namedhist import NamedHist from .tag import loc, overflow, rebin, sum, underflow -# Convenient access to the version number # Convenient access to the version number from .version import version as __version__ diff --git a/src/hist/intervals.py b/src/hist/intervals.py new file mode 100644 index 00000000..678e6b79 --- /dev/null +++ b/src/hist/intervals.py @@ -0,0 +1,145 @@ +from typing import Any, Optional, Tuple + +import numpy as np + +from .typing import Literal + +try: + from scipy import stats +except ModuleNotFoundError: + from sys import stderr + + print( + "hist.intervals requires scipy. Please install hist[plot] or manually install scipy.", + file=stderr, + ) + raise + +__all__ = ("poisson_interval", "clopper_pearson_interval", "ratio_uncertainty") + + +def __dir__() -> Tuple[str, ...]: + return __all__ + + +def poisson_interval( + values: np.ndarray, variances: np.ndarray, coverage: "Optional[float]" = None +) -> np.ndarray: + r""" + The Frequentist coverage interval for Poisson-distributed observations. + + What is calculated is the "Garwood" interval, + c.f. https://www.ine.pt/revstat/pdf/rs120203.pdf or + http://ms.mcmaster.ca/peter/s743/poissonalpha.html. + For weighted data, this approximates the observed count by + ``values**2/variances``, which effectively scales the unweighted + Poisson interval by the average weight. + This may not be the optimal solution: see https://arxiv.org/abs/1309.1287 + for a proper treatment. + + When a bin is zero, the scale of the nearest nonzero bin is substituted to + scale the nominal upper bound. + + Args: + values: Sum of weights. + variances: Sum of weights squared. + coverage: Central coverage interval. + Default is one standard deviation, which is roughly ``0.68``. + + Returns: + The Poisson central coverage interval. + """ + # Parts originally contributed to coffea + # https://github.com/CoffeaTeam/coffea/blob/8c58807e199a7694bf15e3803dbaf706d34bbfa0/LICENSE + if coverage is None: + coverage = stats.norm.cdf(1) - stats.norm.cdf(-1) + scale = np.empty_like(values) + scale[values != 0] = variances[values != 0] / values[values != 0] + if np.sum(values == 0) > 0: + missing = np.where(values == 0) + available = np.nonzero(values) + if len(available[0]) == 0: + raise RuntimeError( + "All values are zero! Cannot compute meaningful uncertainties.", + ) + nearest = np.sum( + [np.square(np.subtract.outer(d, d0)) for d, d0 in zip(available, missing)] + ).argmin(axis=0) + argnearest = tuple(dim[nearest] for dim in available) + scale[missing] = scale[argnearest] + counts = values / scale + interval_min = scale * stats.chi2.ppf((1 - coverage) / 2, 2 * counts) / 2.0 + interval_max = scale * stats.chi2.ppf((1 + coverage) / 2, 2 * (counts + 1)) / 2.0 + interval = np.stack((interval_min, interval_max)) + interval[interval == np.nan] = 0.0 # chi2.ppf produces nan for counts=0 + return interval + + +def clopper_pearson_interval( + num: np.ndarray, denom: np.ndarray, coverage: "Optional[float]" = None +) -> np.ndarray: + r""" + Compute the Clopper-Pearson coverage interval for a binomial distribution. + c.f. http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval + + Args: + num: Numerator or number of successes. + denom: Denominator or number of trials. + coverage: Central coverage interval. + Default is one standard deviation, which is roughly ``0.68``. + + Returns: + The Clopper-Pearson central coverage interval. + """ + # Parts originally contributed to coffea + # https://github.com/CoffeaTeam/coffea/blob/8c58807e199a7694bf15e3803dbaf706d34bbfa0/LICENSE + if coverage is None: + coverage = stats.norm.cdf(1) - stats.norm.cdf(-1) + # Numerator is subset of denominator + if np.any(num > denom): + raise ValueError( + "Found numerator larger than denominator while calculating binomial uncertainty" + ) + interval_min = stats.beta.ppf((1 - coverage) / 2, num, denom - num + 1) + interval_max = stats.beta.ppf((1 + coverage) / 2, num + 1, denom - num) + interval = np.stack((interval_min, interval_max)) + interval[:, num == 0.0] = 0.0 + interval[1, num == denom] = 1.0 + return interval + + +def ratio_uncertainty( + num: np.ndarray, + denom: np.ndarray, + uncertainty_type: Literal["poisson", "poisson-ratio"] = "poisson", +) -> Any: + r""" + Calculate the uncertainties for the values of the ratio ``num/denom`` using + the specified coverage interval approach. + + Args: + num: Numerator or number of successes. + denom: Denominator or number of trials. + uncertainty_type: Coverage interval type to use in the calculation of + the uncertainties. + ``"poisson"`` (default) implements the Poisson interval for the + numerator scaled by the denominator. + ``"poisson-ratio"`` implements the Clopper-Pearson interval for Poisson + distributed ``num`` and ``denom``. + + Returns: + The uncertainties for the ratio. + """ + # Note: As return is a numpy ufuncs the type is "Any" + with np.errstate(divide="ignore"): + ratio = num / denom + if uncertainty_type == "poisson": + ratio_uncert = np.abs(poisson_interval(ratio, num / np.square(denom)) - ratio) + elif uncertainty_type == "poisson-ratio": + # poisson ratio n/m is equivalent to binomial n/(n+m) + ratio_uncert = np.abs(clopper_pearson_interval(num, num + denom) - ratio) + else: + raise TypeError( + f"'{uncertainty_type}' is an invalid option for uncertainty_type." + ) + return ratio_uncert diff --git a/src/hist/plot.py b/src/hist/plot.py index 8b7855eb..2cec5e03 100644 --- a/src/hist/plot.py +++ b/src/hist/plot.py @@ -232,7 +232,7 @@ def plot_pull( try: from iminuit import Minuit # noqa: F401 from scipy.optimize import curve_fit # noqa: F401 - except ImportError: + except ModuleNotFoundError: print( "Hist.plot_pull requires scipy and iminuit. Please install hist[plot] or manually install dependencies.", file=sys.stderr, diff --git a/src/hist/typing.py b/src/hist/typing.py index 177b668e..d3c3aa6d 100644 --- a/src/hist/typing.py +++ b/src/hist/typing.py @@ -2,9 +2,9 @@ from typing import TYPE_CHECKING, Any, Tuple if sys.version_info < (3, 8): - from typing_extensions import Protocol, SupportsIndex + from typing_extensions import Literal, Protocol, SupportsIndex else: - from typing import Protocol, SupportsIndex + from typing import Literal, Protocol, SupportsIndex if TYPE_CHECKING: from numpy import ufunc as Ufunc @@ -14,7 +14,7 @@ Ufunc = Any -__all__ = ("Protocol", "SupportsIndex", "Ufunc", "ArrayLike") +__all__ = ("Literal", "Protocol", "SupportsIndex", "Ufunc", "ArrayLike") def __dir__() -> Tuple[str, ...]: diff --git a/tests/test_intervals.py b/tests/test_intervals.py new file mode 100644 index 00000000..b09a6752 --- /dev/null +++ b/tests/test_intervals.py @@ -0,0 +1,141 @@ +import numpy as np +import pytest +from pytest import approx + +from hist import Hist, axis, intervals + + +@pytest.fixture(scope="session") +def hist_fixture(): + np.random.seed(42) + + hist_1 = Hist( + axis.Regular( + 10, -3, 3, name="X", label="x [units]", underflow=False, overflow=False + ) + ).fill(np.random.normal(size=1000)) + hist_2 = Hist( + axis.Regular( + 10, -3, 3, name="X", label="x [units]", underflow=False, overflow=False + ) + ).fill(np.random.normal(size=1700)) + + return hist_1, hist_2 + + +def test_poisson_interval(hist_fixture): + hist_1, hist_2 = hist_fixture + + interval_min, interval_max = intervals.poisson_interval( + hist_1.values(), hist_2.values() + ) + + assert approx(interval_min) == np.array( + [ + 1.5300291697782136, + 15.163319280895097, + 70.76628228209115, + 138.53885398885032, + 206.76205061547802, + 216.04895966121967, + 146.23699499970348, + 59.34874385941129, + 23.42140089960769, + 4.398298999080774, + ] + ) + assert approx(interval_max) == np.array( + [ + 12.599700199673102, + 28.738493673101413, + 94.88918823365604, + 173.30954997605485, + 246.94963052163627, + 257.713403993323, + 181.58237748187338, + 84.74029590412256, + 38.20780361508876, + 13.75903516119368, + ] + ) + + +def test_clopper_pearson_interval(hist_fixture): + hist_1, _ = hist_fixture + + interval_min, interval_max = intervals.clopper_pearson_interval( + hist_1.values() * 0.8, hist_1.values() + ) + + assert approx(interval_min) == np.array( + [ + 0.4757493739959414, + 0.6739341864914268, + 0.745848569184471, + 0.7626112365367469, + 0.769793269861182, + 0.7705165046817444, + 0.7636693080808218, + 0.7409965720106119, + 0.6996942106437167, + 0.5617068220945686, + ] + ) + + assert approx(interval_max) == np.array( + [ + 0.9660393063397832, + 0.8909499770371633, + 0.8458913477820733, + 0.8331466664953922, + 0.8273390876155183, + 0.8267415410793807, + 0.832305086850061, + 0.8493900886167576, + 0.8763181169809199, + 0.9404385982116935, + ] + ) + + +def test_ratio_uncertainty(hist_fixture): + hist_1, hist_2 = hist_fixture + + uncertainty_min, uncertainty_max = intervals.ratio_uncertainty( + hist_1.values(), hist_2.values(), uncertainty_type="poisson" + ) + + assert approx(uncertainty_min) == np.array( + [ + 0.1439794096271186, + 0.12988019998066708, + 0.0711565635066328, + 0.045722288708959336, + 0.04049103990124614, + 0.038474711321686006, + 0.045227104349518155, + 0.06135954973309016, + 0.12378460125991042, + 0.19774186117590858, + ] + ) + + assert approx(uncertainty_max) == np.array( + [ + 0.22549817680979262, + 0.1615766277480729, + 0.07946632561746425, + 0.04954668134626106, + 0.04327624938437291, + 0.04106267733757407, + 0.04891233040201837, + 0.06909296140898324, + 0.1485919630151803, + 0.2817958228477908, + ] + ) + + with pytest.raises(TypeError): + intervals.ratio_uncertainty( + hist_1.values(), hist_2.values(), uncertainty_type="fail" + )