Source code for pyhs3.data

"""
HS3 Data implementations.

Provides Pydantic classes for handling HS3 data specifications
including point data, unbinned data, and binned data with uncertainties.
"""

from __future__ import annotations

from typing import Annotated, Literal

import hist
import numpy as np
from pydantic import (
    BaseModel,
    Field,
    model_validator,
)

from pyhs3.axes import BinnedAxis, UnbinnedAxis
from pyhs3.collections import NamedCollection, NamedModel
from pyhs3.exceptions import custom_error_msg


[docs] class GaussianUncertainty(BaseModel): """ Gaussian uncertainty specification for data. Attributes: type: Must be "gaussian_uncertainty" sigma: Standard deviations for each data point correlation: Correlation matrix or 0 for no correlation """ type: Literal["gaussian_uncertainty"] = Field( default="gaussian_uncertainty", repr=False ) sigma: list[float] = Field(..., repr=False) correlation: list[list[float]] | Literal[0] = Field(default=0, repr=False) @model_validator(mode="after") def validate_correlation(self) -> GaussianUncertainty: """Validate correlation matrix dimensions.""" if self.correlation != 0: n = len(self.sigma) if len(self.correlation) != n: msg = f"Correlation matrix must be {n}x{n} to match sigma length" raise ValueError(msg) for row in self.correlation: # pylint: disable=not-an-iterable if len(row) != n: msg = f"Correlation matrix must be {n}x{n} to match sigma length" raise ValueError(msg) return self
[docs] class Datum(NamedModel): """ Base class for HS3 data specifications. Provides the foundation for all data implementations, handling common properties like name and type identification. Attributes: name: Custom string identifier for the data type: Type identifier for the data format """ type: str = Field(..., repr=False)
[docs] class PointData(Datum): """ Point data specification for single measurements. Represents a single measured value with optional uncertainty. Attributes: name: Custom string identifier type: Must be "point" value: Measured value uncertainty: Optional uncertainty/error axes: Optional axes for observable bounds (for normalization) """ type: Literal["point"] = Field(default="point", repr=False) value: float = Field(..., repr=False) uncertainty: float | None = Field(default=None, repr=False) axes: list[UnbinnedAxis] | None = Field(default=None, repr=False)
[docs] class UnbinnedData(Datum): """ Unbinned data specification for multiple data points. Represents individual data points in multi-dimensional space with optional weights and uncertainties. Attributes: name: Custom string identifier type: Must be "unbinned" entries: Array of coordinate arrays for each data point axes: Axis specifications defining coordinate system (UnbinnedAxis with required min/max) weights: Optional weights for each entry entries_uncertainties: Optional uncertainties for each coordinate """ type: Literal["unbinned"] = Field(default="unbinned", repr=False) entries: list[list[float]] = Field(..., repr=False) axes: list[UnbinnedAxis] = Field(..., repr=False) weights: list[float] | None = Field(default=None, repr=False) entries_uncertainties: list[list[float]] | None = Field(default=None, repr=False) @model_validator(mode="after") def validate_unbinned_data(self) -> UnbinnedData: """Validate consistency of unbinned data arrays.""" n_entries = len(self.entries) # Check weights length if self.weights is not None and len(self.weights) != n_entries: msg = f"Weights array length ({len(self.weights)}) must match entries length ({n_entries})" raise ValueError(msg) # Check uncertainties shape if self.entries_uncertainties is not None: if len(self.entries_uncertainties) != n_entries: msg = f"Uncertainties array length ({len(self.entries_uncertainties)}) must match entries length ({n_entries})" raise ValueError(msg) # Check each entry has same dimensionality if n_entries > 0: expected_dims = len(self.entries[0]) for i, entry_unc in enumerate(self.entries_uncertainties): if len(entry_unc) != expected_dims: msg = f"Entry uncertainties[{i}] has {len(entry_unc)} dimensions, expected {expected_dims}" raise ValueError(msg) # Check entries dimensionality matches axes if n_entries > 0: entry_dims = len(self.entries[0]) if entry_dims != len(self.axes): msg = f"Entry dimensionality ({entry_dims}) must match number of axes ({len(self.axes)})" raise ValueError(msg) # Check all entries have same dimensionality for i, entry in enumerate(self.entries): if len(entry) != entry_dims: msg = ( f"Entry[{i}] has {len(entry)} dimensions, expected {entry_dims}" ) raise ValueError(msg) return self @property def weighted_entries(self) -> np.ndarray: """ Entries array with each row multiplied by its event weight. Returns a numpy array of shape ``(n_events, n_axes)`` — the same structure as ``entries`` — where each row ``i`` is scaled by ``weights[i]``. When no weights are present the result equals ``np.array(self.entries)``. Axis values can be extracted with standard numpy indexing, e.g. ``data.weighted_entries[:, 0]`` for the first observable. Threshold filtering and sorting are left to the caller:: vals = data.weighted_entries[:, 0] vals = np.sort(vals[np.abs(vals) > 1e-6]) Returns: ndarray of shape (n_events, n_axes) """ if not self.entries: return np.empty((0, len(self.axes)), dtype=np.float64) arr = np.asarray(self.entries, dtype=np.float64) if self.weights is not None: arr = arr * np.asarray(self.weights, dtype=np.float64)[:, np.newaxis] return arr def to_hist( self, nbins: int = 50 ) -> hist.Hist[hist.storage.Weight | hist.storage.Double]: """ Convert to scikit-hep hist.Hist object by binning entries. Creates a hist.Hist histogram by binning the unbinned entries according to the axis specifications. The resulting histogram can be plotted using matplotlib or other visualization tools. Args: nbins: Number of bins to use for each axis (default: 50) Returns: hist.Hist: Histogram representation with: - Axes matching the data axes - Values from binned entries - Weights if provided Examples: >>> entries = [[0.5], [1.2], [1.8]] >>> axes = [UnbinnedAxis(name="x", min=0, max=3)] >>> data = UnbinnedData( ... name="example", ... type="unbinned", ... entries=entries, ... axes=axes ... ) >>> data.to_hist(nbins=3) Hist(Regular(3, 0, 3, name='x'), storage=Double()) # Sum: 3.0 """ # Convert axes to hist.axis objects # UnbinnedAxis doesn't have to_hist(), so create Regular axes manually hist_axes = [ hist.axis.Regular(nbins, axis.min, axis.max, name=axis.name) for axis in self.axes ] # Create histogram with appropriate storage storage = ( hist.storage.Weight() if self.weights is not None else hist.storage.Double() ) h = hist.Hist(*hist_axes, storage=storage) # Transpose entries from [[x1, y1], [x2, y2]] to [[x1, x2], [y1, y2]] if len(self.entries) > 0: entries_transposed = list(zip(*self.entries, strict=True)) # Convert to numpy arrays for filling fill_args = [np.array(coord_list) for coord_list in entries_transposed] # Fill the histogram if self.weights is not None: h.fill(*fill_args, weight=np.array(self.weights)) else: h.fill(*fill_args) return h
[docs] class BinnedData(Datum): """ Binned data specification for histogram data. Represents binned/histogram data in multi-dimensional space with optional uncertainties and correlations. Attributes: name: Custom string identifier type: Must be "binned" contents: Bin contents array axes: Axis specifications defining binning (BinnedAxis with binning info) uncertainty: Optional uncertainty specification """ type: Literal["binned"] = Field(default="binned", repr=False) contents: list[float] = Field(..., repr=False) axes: list[BinnedAxis] = Field(..., repr=False) uncertainty: GaussianUncertainty | None = Field(default=None, repr=False) @model_validator(mode="after") def validate_binned_data(self) -> BinnedData: """Validate binned data consistency.""" # Calculate expected number of bins # BinnedAxis.validate_binning already ensures each axis has valid binning expected_bins = 1 for axis in self.axes: expected_bins *= axis.nbins # Check contents length if len(self.contents) != expected_bins: msg = f"Contents array length ({len(self.contents)}) must match expected number of bins ({expected_bins})" raise ValueError(msg) # Check uncertainty consistency if self.uncertainty is not None and len(self.uncertainty.sigma) != len( self.contents ): msg = f"Uncertainty sigma length ({len(self.uncertainty.sigma)}) must match contents length ({len(self.contents)})" raise ValueError(msg) return self def to_hist(self) -> hist.Hist[hist.storage.Weight | hist.storage.Double]: """ Convert to scikit-hep hist.Hist object for visualization. Creates a hist.Hist histogram from this binned data. The resulting histogram can be plotted using matplotlib or other visualization tools. Note: Correlation matrices in uncertainties are not preserved. Only the sigma values (standard deviations) are included as histogram variances. Returns: hist.Hist: Histogram representation with: - Axes matching the data axes - Values from contents - Variances from uncertainties if present Examples: >>> data = BinnedData( ... name="example", ... type="binned", ... contents=[10, 20, 15], ... axes=[{"name": "x", "min": 0, "max": 3, "nbins": 3}] ... ) >>> data.to_hist() Hist(Regular(3, 0, 3, name='x'), storage=Double()) # Sum: 45.0 """ # Convert axes to hist.axis objects hist_axes = [axis.to_hist() for axis in self.axes] # Create histogram with appropriate storage storage = ( hist.storage.Weight() if self.uncertainty is not None else hist.storage.Double() ) h = hist.Hist(*hist_axes, storage=storage) # Calculate shape from axes shape = tuple(axis.nbins for axis in self.axes) # Reshape contents for assignment if self.uncertainty is not None: # Reshape both contents and variances contents_nd = np.array(self.contents).reshape(shape) variances_nd = np.square(self.uncertainty.sigma).reshape(shape) stacked = np.stack([contents_nd, variances_nd], axis=-1) h[...] = stacked else: # Reshape and set contents using view contents_nd = np.array(self.contents).reshape(shape) h[...] = contents_nd return h
# Type alias for all data types using discriminated union DataType = Annotated[PointData | UnbinnedData | BinnedData, Field(discriminator="type")]
[docs] class Data(NamedCollection[DataType]): """ Collection of HS3 data specifications. Manages a set of data instances that define observed data for likelihood evaluations. Provides dict-like access to data by name. """ root: Annotated[ list[DataType], custom_error_msg( { "union_tag_not_found": "Data entry missing required 'type' field. Expected one of: 'point', 'unbinned', 'binned'", "union_tag_invalid": "Unknown data type '{tag}' does not match any of the expected types: {expected_tags}", } ), ] = Field(default_factory=list)