Source code for velociraptor.tools.lines

"""
Tools to generate various lines from datasets.
"""

import unyt
import numpy as np
import sys
from typing import List


[docs]def binned_mean_line(
    x: unyt.unyt_array,
    y: unyt.unyt_array,
    x_bins: unyt.unyt_array,
    minimum_in_bin: int = 3,
    return_additional: bool = False,
    minimum_additional_points: int = 0,
):
    """
    Gets a mean (y) line, binned in the x direction.

    Parameters
    ----------

    x: unyt.unyt_array
        Horizontal values, to be binned.

    y: unyt.unyt_array
        Vertical values, to have the mean calculated in the x-bins

    x_bins: unyt.unyt_array
        Horizontal bin edges. Must have the same units as x.

    minimum_in_bin: int, optional
        Minimum number of items in a bin to return that bin. If a bin has
        fewer values than this, it is excluded from the return values.
        Default: 3.

    return_additional: bool, optional
        Boolean. If true, makes the function return x and y data points that
        lie in the bins where the number of data points is smaller than
        minimum_in_bin, and any points that are higher than the highest bin
        edge. Default: false.

    minimum_additional_points: int, optional
        Minimum number of additional data points with the highest values of x to return.
        Has to be used with return_additional=True. If set to N, then at least N
        additional data points will always be present in the plot, regardless of how the
        adaptive binning is done. The adaptive binning is stopped at the lowest value of
        x among the additional data points so that these points and the mean line do
        not overlap.


    Returns
    -------

    bin_centers: unyt.unyt_array
        The centers of the bins (taken to be the linear mean of the bin edges).

    means: unyt.unyt_array
        Vertical mean values within the bins.

    standard_deviation: unyt.unyt_array
        Standard deviation within the bins, to be shown as scatter.

    additional_x: unyt.unyt_array, optional
        x data points from the bins where the number of data points is smaller
        than minimum_in_bin

    additional_y: unyt.unyt_array, optional
        y data points from the bins where the number of data points is smaller
        than minimum_in_bin


    Notes
    -----

    The return types are such that you can pass this directly to `plt.errorbar`,
    as follows:

    .. code-block:: python

        plt.errorbar(
            *binned_mean_line(x, y, x_bins, 10)
        )

    """

    assert (
        x.units == x_bins.units
    ), "Please ensure that the x values and bins have the same units."

    means = []
    standard_deviations = []
    centers = []
    additional_x = []
    additional_y = []

    # Do we want to have at least 'minimum_additional_points' additional data points in
    # the plot?
    if return_additional and minimum_additional_points > 0:

        # Sort the data along the X axis
        idx_sort = np.argsort(x)
        x, y = x[idx_sort], y[idx_sort]

        # Ensure we don't run out of data points to plot
        num_points_to_plot = min(minimum_additional_points, len(x))

        # Collect 'num_points_to_plot' additional data points
        additional_x += list(x[-num_points_to_plot:].value)
        additional_y += list(y[-num_points_to_plot:].value)

        # Don't use the collected additional data points for the bins
        x = x[:-num_points_to_plot]
        y = y[:-num_points_to_plot]

    hist = np.digitize(x, x_bins)

    for bin in range(1, len(x_bins)):
        indices_in_this_bin = hist == bin
        number_of_items_in_bin = indices_in_this_bin.sum()

        if number_of_items_in_bin >= minimum_in_bin:
            y_values_in_this_bin = y[indices_in_this_bin].value

            means.append(np.mean(y_values_in_this_bin))
            standard_deviations.append(np.std(y_values_in_this_bin))

            # Bin center is computed as the median of the X values of the data points
            # in the bin
            centers.append(np.median(x[indices_in_this_bin].value))

        # If the number of data points in the bin is less than minimum_in_bin,
        # collect these data points if needed
        elif number_of_items_in_bin > 0 and return_additional:
            additional_x += list(x[indices_in_this_bin].value)
            additional_y += list(y[indices_in_this_bin].value)

    # Add any points that are larger:
    above_highest = hist == len(x_bins)
    additional_x += list(x[above_highest].value)
    additional_y += list(y[above_highest].value)

    # If there is nothing to plot as the mean line, add a test point to avoid having
    # empty axes labels or/and wrong axes limits
    if len(means) == 0:
        float_min = sys.float_info.min
        means, centers, standard_deviations = [float_min], [float_min], [float_min]

    means = unyt.unyt_array(means, units=y.units, name=y.name)
    standard_deviations = unyt.unyt_array(
        standard_deviations, units=y.units, name=f"{y.name} ($sigma$)"
    )
    centers = unyt.unyt_array(centers, units=x.units, name=x.name)
    additional_x = unyt.unyt_array(additional_x, units=x.units, name=x.name)
    additional_y = unyt.unyt_array(additional_y, units=y.units, name=y.name)

    if not return_additional:
        return centers, means, standard_deviations
    else:
        return centers, means, standard_deviations, additional_x, additional_y


[docs]def binned_median_line(
    x: unyt.unyt_array,
    y: unyt.unyt_array,
    x_bins: unyt.unyt_array,
    percentiles: List[int] = [16, 84],
    minimum_in_bin: int = 3,
    return_additional: bool = False,
    minimum_additional_points: int = 0,
):
    """
    Gets a median (y) line, binned in the x direction.

    Parameters
    ----------

    x: unyt.unyt_array
        Horizontal values, to be binned.

    y: unyt.unyt_array
        Vertical values, to have the median calculated in the x-bins

    x_bins: unyt.unyt_array
        Horizontal bin edges. Must have the same units as x.

    percentiles: List[int], optional
        Percentiles to return as the positive and negative errors. By
        default these are 16 and 84th percentiles.

    minimum_in_bin: int, optional
        Minimum number of items in a bin to return that bin. If a bin has
        fewer values than this, it is excluded from the return values.
        Default: 3.

    return_additional: bool, optional
        Boolean. If true, makes the function return x and y data points that
        lie in the bins where the number of data points is smaller than
        minimum_in_bin, and any points that are higher than the highest bin
        edge. Default: false.

    minimum_additional_points: int, optional
        Minimum number of additional data points with the highest values of x to return.
        Has to be used with return_additional=True. If set to N, then at least N
        additional data points will always be present in the plot, regardless of how the
        adaptive binning is done. The adaptive binning is stopped at the lowest value of
        x among the additional data points so that these points and the median line do
        not overlap.


    Returns
    -------

    bin_centers: unyt.unyt_array
        The centers of the bins (taken to be the linear mean of the bin edges).

    medians: unyt.unyt_array
        Vertical median values within the bins.

    deviations: unyt.unyt_array
        Deviation from median vertically using the ``percentiles`` defined above.
        This has shape 2xN.

    additional_x: unyt.unyt_array, optional
        x data points from the bins where the number of data points is smaller
        than minimum_in_bin

    additional_y: unyt.unyt_array, optional
        y data points from the bins where the number of data points is smaller
        than minimum_in_bin


    Notes
    -----

    The return types are such that you can pass this directly to `plt.errorbar`,
    as follows:

    .. code-block:: python

        plt.errorbar(
            *binned_median_line(x, y, x_bins, 10)
        )

    """

    assert (
        x.units == x_bins.units
    ), "Please ensure that the x values and bins have the same units."

    medians = []
    deviations = []
    centers = []
    additional_x = []
    additional_y = []

    # Do we want to have at least 'minimum_additional_points' additional data points in
    # the plot?
    if return_additional and minimum_additional_points > 0:

        # Sort the data along the X axis
        idx_sort = np.argsort(x)
        x, y = x[idx_sort], y[idx_sort]

        # Ensure we don't run out of data points to plot
        num_points_to_plot = min(minimum_additional_points, len(x))

        # Collect 'num_points_to_plot' additional data points
        additional_x += list(x[-num_points_to_plot:].value)
        additional_y += list(y[-num_points_to_plot:].value)

        # Don't use the collected additional data points for the bins
        x = x[:-num_points_to_plot]
        y = y[:-num_points_to_plot]

    hist = np.digitize(x, x_bins)

    for bin in range(1, len(x_bins)):
        indices_in_this_bin = hist == bin
        number_of_items_in_bin = indices_in_this_bin.sum()

        if number_of_items_in_bin >= minimum_in_bin:
            y_values_in_this_bin = y[indices_in_this_bin].value

            medians.append(np.median(y_values_in_this_bin))
            deviations.append(np.percentile(y_values_in_this_bin, percentiles))

            # Bin center is computed as the median of the X values of the data points
            # in the bin
            centers.append(np.median(x[indices_in_this_bin].value))

        # If the number of data points in the bin is less than minimum_in_bin,
        # collect these data points if needed
        elif number_of_items_in_bin > 0 and return_additional:
            additional_x += list(x[indices_in_this_bin].value)
            additional_y += list(y[indices_in_this_bin].value)

    # Add any points that are larger:
    above_highest = hist == len(x_bins)
    additional_x += list(x[above_highest].value)
    additional_y += list(y[above_highest].value)

    # If there is nothing to plot as the median line, add a test point to avoid having
    # empty axes labels or/and wrong axes limits
    if len(medians) == 0:
        float_min = sys.float_info.min
        medians, centers, deviations = (
            [float_min],
            [float_min],
            [[float_min, float_min]],
        )

    medians = unyt.unyt_array(medians, units=y.units, name=y.name)
    # Percentiles actually gives us the values - we want to be able to use
    # matplotlib's errorbar function
    deviations = unyt.unyt_array(
        abs(np.array(deviations).T - medians.value),
        units=y.units,
        name=f"{y.name} {percentiles} percentiles",
    )
    centers = unyt.unyt_array(centers, units=x.units, name=x.name)
    additional_x = unyt.unyt_array(additional_x, units=x.units, name=x.name)
    additional_y = unyt.unyt_array(additional_y, units=y.units, name=y.name)

    if not return_additional:
        return centers, medians, deviations
    else:
        return centers, medians, deviations, additional_x, additional_y