Source code for velociraptor.catalogue.soap_catalogue

from __future__ import annotations

import numpy as np
import h5py

from typing import List, Set, Union, Dict
from types import SimpleNamespace

from velociraptor.catalogue.catalogue import Catalogue
from velociraptor.catalogue.translator import VR_to_SOAP

from functools import reduce

import unyt
from swiftsimio.conversions import swift_cosmology_to_astropy
from swiftsimio.metadata.unit.unit_types import find_nearest_base_unit
from swiftsimio.metadata.unit.unit_types import unit_names_to_unyt

from abc import ABC, abstractmethod
from pathlib import Path


[docs]class SWIFTUnitsMockup(object):
    """
    Takes a dict with internal swift units and generates
    a unyt system compatible with swiftsimio.

    Attributes
    ----------
    mass : float
        unit for mass used
    length : float
        unit for length used
    time : float
        unit for time used
    current : float
        unit for current used
    temperature : float
        unit for temperature used
    """

    def __init__(self, units: Dict):
        """
        Parameters
        ----------
        units : Dict
            Dictionary with internal units from the swift snapshot
            metadata saved in the SOAP catalogue
        """

        unyt_units = {
            name: unyt.unyt_quantity(value[0], units=unit_names_to_unyt[name])
            for name, value in units.items()
        }

        self.mass = find_nearest_base_unit(unyt_units["Unit mass in cgs (U_M)"], "mass")
        self.length = find_nearest_base_unit(
            unyt_units["Unit length in cgs (U_L)"], "length"
        )
        self.time = find_nearest_base_unit(unyt_units["Unit time in cgs (U_t)"], "time")
        self.current = find_nearest_base_unit(
            unyt_units["Unit current in cgs (U_I)"], "current"
        )
        self.temperature = find_nearest_base_unit(
            unyt_units["Unit temperature in cgs (U_T)"], "temperature"
        )

        return


[docs]class CatalogueElement(object):
    """
    Abstract class for catalogue elements. These map to specific objects in
    the SOAP output file.

    The SOAP output file is a tree structure with HDF5 groups that contain
    either more HDF5 groups or HDF5 datasets. Each group/dataset has a name
    that corresponds to its path in the SOAP file.
    """

    # path to the SOAP file
    file_name: Path
    # name of the HDF5 group/dataset in the SOAP file
    name: str

    def __init__(self, file_name: Path, name: str):
        """
        Constructor.

        Parameters:
         - file_name: Path
           Path to the SOAP catalogue file.
         - name: str
           Path of the dataset/group in the SOAP catalogue file.
        """
        self.file_name = file_name
        self.name = name


[docs]class CatalogueDataset(CatalogueElement):
    """
    Representation of a SOAP dataset.

    A dataset has unit metadata and values that are only read if the dataset
    is actually used.
    """

    # conversion factor from SOAP units to a unyt_array with units
    conversion_factor: Union[unyt.unyt_quantity, None]
    # value of the dataset. Only set when the dataset is actually used
    _value: Union[unyt.unyt_array, None]

    def __init__(self, file_name: Path, name: str, handle: h5py.File):
        """
        Constructor.

        Parameters:
         - file_name: Path
           Path to the SOAP catalogue file.
         - name: str
           Path of the dataset in the SOAP catalogue file.
         - handle: h5py.File
           HDF5 file handle. Used to avoid having to open and close the file
           in the constructor.
        """
        super().__init__(file_name, name)

        self._value = None
        self.conversion_factor = None
        self._register_metadata(handle)

    def _register_metadata(self, handle: h5py.File):
        """
        Read the unit metadata from the HDF5 file and store it in the conversion
        factor.

        Parameters:
         - handle: h5py.File
           HDF5 file handle. Used to avoid having to open and close the file
           in the constructor.
        """
        metadata = handle[self.name].attrs
        factor = (
            metadata["Conversion factor to CGS (including cosmological corrections)"][0]
            * unyt.A ** metadata["U_I exponent"][0]
            * unyt.cm ** metadata["U_L exponent"][0]
            * unyt.g ** metadata["U_M exponent"][0]
            * unyt.K ** metadata["U_T exponent"][0]
            * unyt.s ** metadata["U_t exponent"][0]
        )
        self.conversion_factor = unyt.unyt_quantity(factor)
        # avoid overflow by setting the base unit system to something that works
        # well for cosmological simulations
        self.conversion_factor.convert_to_base("galactic")

[docs]    def set_value(self, value: unyt.unyt_array, group: CatalogueGroup):
        """
        Setter for the dataset values.

        Parameters:
         - value: unyt.unyt_array
           New values for the dataset.
         - group: CatalogueGroup
           Group this dataset belongs to. Only provided for property()
           compatibility (since we want the dataset to be a property of the
           CatalogueGroup object).
        """
        self._value = value

[docs]    def del_value(self, group: CatalogueGroup):
        """
        Deleter for the dataset values.

        Parameters:
         - group: CatalogueGroup
           Group this dataset belongs to. Only provided for property()
           compatibility (since we want the dataset to be a property of the
           CatalogueGroup object).
        """
        del self._value
        self._value = None

[docs]    def get_value(self, group: CatalogueGroup) -> unyt.unyt_array:
        """
        Getter for the dataset values.
        Performs lazy reading: if the value has not been read before, it is
        read from the SOAP catalogue file. Otherwise, a buffered value is used.

        Parameters:
         - group: CatalogueGroup
           Group this dataset belongs to. Only provided for property()
           compatibility (since we want the dataset to be a property of the
           CatalogueGroup object).

        Returns the dataset values as a unyt.unyt_array.
        """
        if self._value is None:
            with h5py.File(self.file_name, "r") as handle:
                self._value = handle[self.name][:] * self.conversion_factor
            self._value.name = self.name.replace("/", " ").replace("_", "").strip()
        return self._value


[docs]class CatalogueDerivedDataset(CatalogueElement, ABC):
    """
    Representation of a derived SOAP dataset.

    A derived dataset is a dataset that can be trivially derived from another
    (set of) dataset(s). It is similar to a dataset registered using a
    registration function, but still supports lazy evaluation. Its main
    purpose is to guarantee full compatibility between the SOAP and VR
    catalogues in the pipeline.
    """

    # list of datasets that are required to compute this derived dataset
    terms: List[CatalogueDataset]
    # value of the dataset. Only set when the dataset is actually used
    _value: Union[unyt.unyt_array, None]

    def __init__(self, file_name: Path, name: str, terms: List[CatalogueDataset]):
        """
        Constructor.

        Parameters:
         - file_name: Path
           Path to the SOAP catalogue file.
         - name: str
           (Fake) path of the dataset in the SOAP catalogue file.
         - terms: List[CatalogueDataset]
           List of datasets that are required to compute the derived dataset.
        """
        super().__init__(file_name, name)
        self._value = None
        self.terms = list(terms)

[docs]    def set_value(self, value: unyt.unyt_array, group: CatalogueGroup):
        """
        Setter for the dataset values.

        Parameters:
         - value: unyt.unyt_array
           New values for the dataset.
         - group: CatalogueGroup
           Group this dataset belongs to. Only provided for property()
           compatibility (since we want the dataset to be a property of the
           CatalogueGroup object).
        """
        self._value = value

[docs]    def del_value(self, group: CatalogueDataset):
        """
        Deleter for the dataset values.

        Parameters:
         - group: CatalogueGroup
           Group this dataset belongs to. Only provided for property()
           compatibility (since we want the dataset to be a property of the
           CatalogueGroup object).
        """
        del self._value
        self._value = None

[docs]    def get_value(self, group: CatalogueDataset) -> unyt.unyt_array:
        """
        Getter for the dataset values.
        Performs lazy evaluation: if the value has not been computed before,
        all the datasets that are required to compute it are obtained (and
        might be lazily read at this point), and the child class specific
        computation method is called. Otherwise, a buffered value is returned.

        Parameters:
         - group: CatalogueGroup
           Group this dataset belongs to. Only provided for property()
           compatibility (since we want the dataset to be a property of the
           CatalogueGroup object).

        Returns the dataset values as a unyt.unyt_array.
        """
        if self._value is None:
            values = [term.get_value(group) for term in self.terms]
            self._value = self.compute_value(*values)
        return self._value

[docs]    @abstractmethod
    def compute_value(self, *values: unyt.unyt_array) -> unyt.unyt_array:
        """
        Subclass specific computation method for the derived dataset. Should be
        implemented by each subclass.

        Parameters:
         - *values: unyt.unyt_array
           Datasets that are required for the calculation.

        Returns the computed values as a unyt.unyt_array.
        """
        raise NotImplementedError("This calculation has not been implemented!")


[docs]class VelocityDispersion(CatalogueDerivedDataset):
    """
    CatalogueDerivedDataset that computes the 1D velocity dispersion from the
    full 3D velocity dispersion matrix.
    """

[docs]    def compute_value(
        self, velocity_dispersion_matrix: unyt.unyt_array
    ) -> unyt.unyt_array:
        """
        Calculate the 1D velocity dispersion from the velocity dispersion matrix.

        The velocity dispersion matrix in SOAP consists of the 6 non-trivial
        elements of
          V_{ij} = Sigma_p (v_{p,i} - <v>_i)**2 * (v_{p,j} - <v>_j)**2,
        where v_p is the particle velocity and <v> is a reference velocity.

        Since this is a symmetric matrix, SOAP only outputs (in this order)
          V_XX, V_YY, V_ZZ, V_XY, V_XZ, V_YZ

        The 1D velocity dispersion output by VR is
          sqrt(V_XX + V_YY + V_ZZ)

        Parameters:
         - velocity_dispersion_matrix: unyt.unyt_array
           SOAP velocity dispersion matrix.

        Returns the 1D velocity dispersion as a unyt.unyt_array.
        """
        return np.sqrt(velocity_dispersion_matrix[:, 0:3].sum(axis=1))


[docs]class CatalogueGroup(CatalogueElement):
    """
    Representation of an HDF5 group in the SOAP catalogue.

    A CatalogueGroup contains other groups or datasets. Datasets are exposed as
    new attributes for the CatalogueGroup object, so that things like
      so.v200_crit.totalmass
    map directly to the CatalogueDataset::get_value() function that retrieves
      SOAP_catalogue["SO/200_crit/TotalMass"][:]

    Note that attribute names cannot start with a number, so we use the
    convention that numeric group names are preceded by a 'v' (for value).
    """

    # elements in the group. Can be either other groups or (derived) datasets.
    elements: List[CatalogueElement]
    # properties that will be registered as extra attributes for this object
    properties: Dict

    def __init__(self, file_name: Path, name: str, handle: h5py.File):
        """
        Constructor.

        Parameters:
         - file_name: Path
           Path to the SOAP catalogue file.
         - name: str
           Path of the dataset in the SOAP catalogue file.
         - handle: h5py.File
           HDF5 file handle. Used to avoid having to open and close the file
           in the constructor.
        """
        super().__init__(file_name, name)

        self.elements = []
        self._register_elements(handle)
        self._register_properties()
        self._register_extra_properties()

    def _register_elements(self, handle: h5py.File):
        """
        Create CatologueGroup and CatalogueDataset objects for all the elements
        of this group in the SOAP HDF5 file.
        """
        h5group = handle[self.name] if self.name != "" else handle["/"]
        for (key, h5obj) in h5group.items():
            if isinstance(h5obj, h5py.Group):
                el = CatalogueGroup(self.file_name, f"{self.name}/{key}", handle)
                dynamically_register_properties(el)
                self.elements.append(el)
            elif isinstance(h5obj, h5py.Dataset):
                self.elements.append(
                    CatalogueDataset(self.file_name, f"{self.name}/{key}", handle)
                )

    def __str__(self) -> str:
        """
        String representation of this class.
        """
        return (
            "CatalogueGroup containing the following elements:"
            f" {[el.name for el in self.elements]}"
        )

    def _register_properties(self):
        """
        Register all elements of this group as attributes for this object.
        CatalogueGroup elements are simply registered as an attribute, while
        for CatalogueDataset objects we create a dictionary containing custom
        property() functions that can later be assigned to this object using
        dynamically_register_properties().
        """
        self.properties = {}
        for el in self.elements:
            basename = el.name.split("/")[-1].lower()
            # attribute names cannot start with a number
            if basename[0].isnumeric():
                basename = f"v{basename}"
            if isinstance(el, CatalogueGroup):
                setattr(self, basename, el)
            elif isinstance(el, CatalogueDataset):
                self.properties[basename] = (
                    el,
                    property(el.get_value, el.set_value, el.del_value),
                )

    def _register_extra_properties(self):
        """
        Register derived properties that were present in the old VR catalogue
        but not in the SOAP catalogue.
        These could also use registration functions, but that would affect the
        pipeline.

        In practice, the only property that needs this special treatment is
        'stellarvelocitydispersion'. The reason is that SOAP contains the full
        velocity dispersion matrix, while VR only outputs the square root of
        the trace of this matrix, which is the quantity that is used in the
        pipeline.
        """
        try:
            stellar_velocity_dispersion_matrix = self.properties[
                "stellarvelocitydispersionmatrix"
            ][0]
            el = VelocityDispersion(
                self.file_name,
                "stellarvelocitydispersion",
                [stellar_velocity_dispersion_matrix],
            )
            self.elements.append(el)
            self.properties["stellarvelocitydispersion"] = (
                el,
                property(el.get_value, el.set_value, el.del_value),
            )
        except KeyError:
            pass


[docs]def dynamically_register_properties(group: CatalogueGroup):
    """
    Trick an object into thinking it is of a different class that has additional
    properties, based on the properties contained in the group.properties
    dictionary.

    Concrete example: suppose 'group' enters this method as
     group.elements = [CatalogueDataset<a>, CatalogueDataset<b>]
     group.properties= {"a_name": (CatalogueDataset<a>, CatalogueDataset<a>.value),
                        "b_name": (CatalogueDataset<a>, CatalogueDataset<a>.value)},
    where we have used
     CatalogueDataset<x>.value
    as a shorthand for
     property(CatalogueDataset<x>.get_value,
              CatalogueDataset<x>.set_value,
              CatalogueDataset<x>.del_value)
    After this method acts on 'group', it will look like this:
     group.a_name = CatalogueDataset<a>.get_value
     group.b_name = CatalogueDataset<b>.get_value
    """

    # name for the new class
    # by using the full path of the group in the SOAP catalogue file, we
    # guarantee that this name is unique
    class_name = f"{group.__class__.__name__}{group.name.replace('/', '_')}"
    # create the new properties dictionary that we will use to overwrite the
    # existing properties for this object
    props = {name: value[1] for (name, value) in group.properties.items()}
    # now create a new class that uses these properties
    child_class = type(class_name, (group.__class__,), props)

    # change the class type of 'group', tricking it into thinking it is now an
    # object of this class
    group.__class__ = child_class


[docs]class SOAPCatalogue(Catalogue):
    """
    Catalogue specialisation for a SOAP catalogue.
    """

    # Path to the SOAP catalogue file
    file_name: Path
    # Root CatalogueGroup, containing all groups on the root level in the
    # catalogue file.
    root: CatalogueGroup
    # Set keeping track of which datasets in the catalogue were actually used
    # Useful for debugging.
    names_used: Set[str]

    def __init__(self, file_name: Path):
        """
        Constructor.

        Parameters:
         - file_name: Path
           Path to the SOAP catalogue file.
        """
        super().__init__("SOAP")
        self.file_name = file_name
        self.names_used = set()
        self._register_quantities()

[docs]    def print_fields(self):
        """
        Debugging function used to output the fields that were actually
        accessed sicne the catalogue was opened.
        """
        print("SOAP catalogue fields used:")
        for name in self.names_used:
            print(f"  {name}")

    def _register_quantities(self):
        """
        Open the SOAP catalogue file and recursively create CatalogueGroup and
        CatalogueDataset objects for the HDF5 groups and datasets in it.

        Also read some relevant metadata, like the box size and the cosmology.
        """
        with h5py.File(self.file_name, "r") as handle:
            self.root = CatalogueGroup(self.file_name, "", handle)
            cosmology = handle["SWIFT/Cosmology"].attrs
            # set up a dummy units object for compatibility with the old VR API
            self.units = SimpleNamespace()
            self.a = cosmology["Scale-factor"][0]
            self.units.scale_factor = cosmology["Scale-factor"][0]
            self.z = cosmology["Redshift"][0]
            self.units.redshift = cosmology["Redshift"][0]

            swift_units = SWIFTUnitsMockup(dict(handle["SWIFT/Units"].attrs))
            self.cosmology = swift_cosmology_to_astropy(dict(cosmology), swift_units)

            # get the box size and length unit from the SWIFT header and unit metadata
            boxsize = handle["SWIFT/Header"].attrs["BoxSize"][0]
            boxsize_unit = (
                handle["SWIFT/InternalCodeUnits"].attrs["Unit length in cgs (U_L)"][0]
                * unyt.cm
            ).in_base("galactic")
            boxsize *= boxsize_unit
            physical_boxsize = self.a * boxsize
            self.units.box_length = boxsize
            self.units.comoving_box_volume = boxsize ** 3
            self.units.period = physical_boxsize
            self.units.physical_box_volume = physical_boxsize ** 3
            self.units.cosmology = self.cosmology

[docs]    def get_SOAP_quantity(self, quantity_name: str) -> unyt.unyt_array:
        """
        Get the quantity with the given name from the catalogue.

        Quantities should be addressed using their full path in the catalogue
        file, with the convention that the name is fully written in small caps
        and that '/' is replaced with '.'. Since attribute names cannot start
        with a digit, we additionally add a 'v' in between '.' and any digit.

        Parameters:
         - quantity_name: str
           Full path to the quantity in the SOAP catalogue.

        Returns the corresponding quantity as a unyt.unyt_array.
        """
        path = [
            f"v{path_part}" if path_part[0].isnumeric() else path_part
            for path_part in quantity_name.split(".")
        ]
        value = reduce(getattr, path, self.root)
        self.names_used.add(quantity_name)
        return value

[docs]    def get_quantity(self, quantity_name: str) -> unyt.unyt_array:
        """
        Get the quantity with the given name from the catalogue.

        This version uses a fallback mechanism to deal with quantities that are
        addressed using the wrong name, i.e. quantities for which the old VR
        catalogue name is used.
        We first try to use the parent class get_quantity() version that assumes
        all datasets are simply exposed as attributes. If this fail, we use the
        SOAP catalogue version above. If that fails too, we try to find a SOAP
        equivalent for the given quantity name in the VR_to_SOAP translator
        function. If that fails to, we bail out with a NotImplementedError.

        Parameters:
         - quantity_name: str
           Full path to the quantity in the SOAP catalogue.

        Returns the corresponding quantity as a unyt.unyt_array.
        """
        try:
            return super().get_quantity(quantity_name)
        except AttributeError:
            pass
        try:
            return self.get_SOAP_quantity(quantity_name)
        except AttributeError:
            pass
        SOAP_quantity_name, colidx = VR_to_SOAP(quantity_name)
        quantity = self.get_SOAP_quantity(SOAP_quantity_name)
        if colidx >= 0:
            return quantity[:, colidx]
        else:
            return quantity