Source code for pyhs3.distributions.histfactory

"""
HistFactory Distribution implementation.

Provides the HistFactoryDist class for handling binned statistical models
with samples and modifiers as defined in the HS3 specification.
"""

from __future__ import annotations

from typing import Any, Literal, cast

import pytensor.tensor as pt
from pydantic import Field

from pyhs3.context import Context

# Import existing distributions for constraint terms
from pyhs3.distributions.core import Distribution
from pyhs3.distributions.histfactory.axes import Axes, BinnedAxis
from pyhs3.distributions.histfactory.modifiers import HasConstraint, Modifier
from pyhs3.distributions.histfactory.samples import Sample, Samples
from pyhs3.networks import HasDependencies, HasInternalNodes
from pyhs3.typing.aliases import TensorVar



[docs]
class HistFactoryDistChannel(Distribution, HasInternalNodes):
    r"""
    HistFactory probability distribution for a single channel/region.

    Implements binned statistical models consisting of histograms (step functions)
    with various modifiers as defined in the HS3 specification. Each HistFactoryDistChannel
    represents one independent measurement channel/region with its own observed data.
    Multiple channels can be combined in a workspace to form a complete HistFactory model.

    The total likelihood consists of:
    1. **Main likelihood**: Poisson likelihood for observed bin counts vs expected rates
    2. **Constraint likelihoods**: Constraint terms for nuisance parameters

    The prediction for a binned region is given as:

    .. math::

        \lambda(x) = \sum_{s \in \text{samples}} \left[
            \left( d_s(x) + \sum_{\delta \in M_\delta} \delta(x,\theta_\delta) \right)
            \prod_{\kappa \in M_\kappa} \kappa(x,\theta_\kappa)
        \right]

    where:
        - :math:`d_s(x)` is the nominal prediction for sample :math:`s`
        - :math:`M_\delta` are additive modifiers (histosys)
        - :math:`M_\kappa` are multiplicative modifiers (normfactor, normsys, shapefactor, etc.)

    **Observed Data Convention:**
        Observed data must be provided in the context as ``{name}_observed`` where
        ``name`` is the HistFactory distribution name. This is required for likelihood
        evaluation.

    **Constraint Types:**
        - **Gaussian constraints** (default): histosys, normsys, staterror
        - **Poisson constraints** (default): shapesys
        - All constraint types can be overridden via the ``constraint`` field

    Parameters:
        axes (list): Array of axis definitions with binning information
        samples (list): Array of sample definitions with data and modifiers

    Supported Modifiers:
        - normfactor: Multiplicative scaling by parameter value
        - normsys: Multiplicative systematic with hi/lo interpolation
        - histosys: Additive correlated shape systematic
        - shapefactor: Uncorrelated multiplicative bin-by-bin scaling
        - shapesys: Uncorrelated shape systematic with Poisson constraints
        - staterror: Statistical uncertainty via Barlow-Beeston method

    Modifier Naming in Dependency Graph:
        Modifiers have simple names (e.g., "lumi") in the HS3 specification, but are
        given unique identifiers in the dependency graph by prepending the full context:
        ``{dist_name}/{sample_name}/{modifier_type}/{modifier_name}``

        This design distinguishes individual modifier instances while allowing parameters
        to indicate correlation - modifiers sharing the same parameter name are correlated.

        Example: Two modifiers both named "lumi" in different samples will have unique
        graph nodes like "SR/signal/normsys/lumi" and "CR/background/normsys/lumi", but
        if they share the same parameter name, they are correlated.

    HS3 Reference:
        :hs3:label:`histfactory_dist <hs3.histfactory-distribution>`
    """

    type: Literal["histfactory_dist"] = "histfactory_dist"
    axes: Axes = Field(..., json_schema_extra={"preprocess": False})
    samples: Samples = Field(..., json_schema_extra={"preprocess": False})

    def get_internal_nodes(self) -> list[Any]:
        """
        Return all internal nodes that need to be in the dependency graph.

        Modifiers can have the same name across different samples/types (e.g., "Lumi"
        appearing in multiple places), but the dependency graph requires unique node
        identifiers. We create wrapper objects that provide unique names while
        delegating to the original modifier's functionality.
        """
        nodes = []

        for sample in self.samples:
            for modifier in sample.modifiers:
                # Create unique node name: {dist_name}/{sample_name}/{modifier_type}/{modifier_name}
                # This ensures uniqueness even when modifier names are reused across samples
                node_name = f"{self.name}/{sample.name}/{modifier.type}/{modifier.name}"

                # Create a lightweight wrapper that provides the unique name for the dependency graph
                # while delegating all functionality to the original modifier
                class ModifierNode(HasDependencies):
                    """
                    Wrapper Modifier to provide a globally unique internal name for the dependency graph.
                    """

                    def __init__(self, name: str, modifier: Modifier):
                        self.name = name
                        self._modifier = modifier

                    @property
                    def dependencies(self) -> set[str]:
                        return self._modifier.dependencies

                    def expression(self, context: Context) -> TensorVar:
                        return self._modifier.expression(context)

                nodes.append(ModifierNode(node_name, modifier))

        return nodes

    @property
    def parameters(self) -> set[str]:
        """Return all parameters used by this HistFactory distribution."""
        params = set()
        # HistFactory distribution requires observed data parameter
        params.add(f"{self.name}_observed")
        # Collect parameters from all modifiers
        for sample in self.samples:
            for modifier in sample.modifiers:
                params.update(modifier.dependencies)
        return params

    def likelihood(self, context: Context) -> TensorVar:
        """
        Build the HistFactory main Poisson likelihood.

        Returns the Poisson probability for observed bin counts vs expected rates.
        Does NOT include constraint terms - those are added via extended_likelihood().

        Args:
            context: Mapping of parameter names to PyTensor variables

        Returns:
            PyTensor expression for the main Poisson model probability
        """
        # Extract binning information
        total_bins = self._get_total_bins()

        # Process all samples and compute expected rates
        expected_rates = self._compute_expected_rates(context, total_bins)

        # Build main Poisson model for observed data
        return self._build_main_model(context, expected_rates)

    def extended_likelihood(
        self, context: Context, _data: TensorVar | None = None
    ) -> TensorVar:
        """
        Build constraint model for nuisance parameters.

        Args:
            context: Mapping of parameter names to PyTensor variables
            data: Not used for HistFactory constraints (observed data is in main model)

        Returns:
            PyTensor expression for the constraint likelihood terms
        """
        constraint_probs = []

        # Collect all constraint terms from modifiers
        for sample in self.samples:
            for modifier in sample.modifiers:
                if isinstance(modifier, HasConstraint):
                    prob = modifier.make_constraint(context, sample.data)
                    constraint_probs.append(prob)

        if not constraint_probs:
            return pt.constant(1.0)

        # Multiply all constraint probabilities
        return cast(TensorVar, pt.prod(pt.stack(constraint_probs)))  # type: ignore[no-untyped-call]

    def _get_total_bins(self) -> int:
        """Calculate total number of bins across all axes."""
        return self.axes.get_total_bins()

    def _compute_expected_rates(self, context: Context, total_bins: int) -> TensorVar:
        """
        Compute expected event rates for all bins.

        Applies all modifiers to sample predictions to get final rates.
        """
        # Start with zeros for total prediction
        total_rates = pt.zeros(total_bins)  # type: ignore[no-untyped-call]

        # Process each sample
        for sample in self.samples:
            sample_rates = self._process_sample(context, sample, total_bins)
            total_rates = total_rates + sample_rates

        return cast(TensorVar, total_rates)

    def _process_sample(
        self, context: Context, sample: Sample, total_bins: int
    ) -> TensorVar:
        """Process a single sample with its modifiers."""
        # Get nominal bin contents
        contents = sample.data.contents
        if len(contents) != total_bins:
            msg = (
                f"Sample {sample.name} has {len(contents)} bins, expected {total_bins}"
            )
            raise ValueError(msg)

        nominal_rates = pt.as_tensor_variable(contents)

        # Apply modifiers using pre-computed results where possible
        modified_rates = nominal_rates

        for modifier in sample.modifiers:
            modified_rates = modifier.apply(context, modified_rates)

        return cast(TensorVar, modified_rates)

    def _build_main_model(
        self, context: Context, expected_rates: TensorVar
    ) -> TensorVar:
        """
        Build the main Poisson model for observed data.

        Observed data must be provided in the context as '{name}_observed' where
        name is the HistFactory distribution name. This is a required parameter
        for likelihood evaluation.

        Returns:
            PyTensor expression for the Poisson probability (not log probability)
        """
        # Create a Poisson likelihood for the observed bin counts
        # Observed data is required - no defensive programming needed
        observed_data_param = f"{self.name}_observed"
        observed_data = context[observed_data_param]

        # Build product of individual Poisson probabilities for each bin
        # P(observed_i | expected_i) = exp(observed_i * log(expected_i) - expected_i - log(observed_i!))
        log_probs = (
            observed_data * pt.log(expected_rates)
            - expected_rates
            - pt.gammaln(observed_data + 1)
        )
        # Convert from log probabilities to probabilities
        probs = pt.exp(log_probs)
        main_prob = pt.prod(probs)  # type: ignore[no-untyped-call]

        return cast(TensorVar, main_prob)



# Registry of histfactory distributions
distributions: dict[str, type[Distribution]] = {
    "histfactory_dist": HistFactoryDistChannel,
}

# Define what should be exported from this module
__all__ = [
    "Axes",
    "BinnedAxis",
    "HistFactoryDistChannel",
    "distributions",
]