Source code for velociraptor.tools.lines
"""
Tools to generate various lines from datasets.
"""
import unyt
import numpy as np
import sys
from typing import List
[docs]def binned_mean_line(
x: unyt.unyt_array,
y: unyt.unyt_array,
x_bins: unyt.unyt_array,
minimum_in_bin: int = 3,
return_additional: bool = False,
minimum_additional_points: int = 0,
):
"""
Gets a mean (y) line, binned in the x direction.
Parameters
----------
x: unyt.unyt_array
Horizontal values, to be binned.
y: unyt.unyt_array
Vertical values, to have the mean calculated in the x-bins
x_bins: unyt.unyt_array
Horizontal bin edges. Must have the same units as x.
minimum_in_bin: int, optional
Minimum number of items in a bin to return that bin. If a bin has
fewer values than this, it is excluded from the return values.
Default: 3.
return_additional: bool, optional
Boolean. If true, makes the function return x and y data points that
lie in the bins where the number of data points is smaller than
minimum_in_bin, and any points that are higher than the highest bin
edge. Default: false.
minimum_additional_points: int, optional
Minimum number of additional data points with the highest values of x to return.
Has to be used with return_additional=True. If set to N, then at least N
additional data points will always be present in the plot, regardless of how the
adaptive binning is done. The adaptive binning is stopped at the lowest value of
x among the additional data points so that these points and the mean line do
not overlap.
Returns
-------
bin_centers: unyt.unyt_array
The centers of the bins (taken to be the linear mean of the bin edges).
means: unyt.unyt_array
Vertical mean values within the bins.
standard_deviation: unyt.unyt_array
Standard deviation within the bins, to be shown as scatter.
additional_x: unyt.unyt_array, optional
x data points from the bins where the number of data points is smaller
than minimum_in_bin
additional_y: unyt.unyt_array, optional
y data points from the bins where the number of data points is smaller
than minimum_in_bin
Notes
-----
The return types are such that you can pass this directly to `plt.errorbar`,
as follows:
.. code-block:: python
plt.errorbar(
*binned_mean_line(x, y, x_bins, 10)
)
"""
assert (
x.units == x_bins.units
), "Please ensure that the x values and bins have the same units."
means = []
standard_deviations = []
centers = []
additional_x = []
additional_y = []
# Do we want to have at least 'minimum_additional_points' additional data points in
# the plot?
if return_additional and minimum_additional_points > 0:
# Sort the data along the X axis
idx_sort = np.argsort(x)
x, y = x[idx_sort], y[idx_sort]
# Ensure we don't run out of data points to plot
num_points_to_plot = min(minimum_additional_points, len(x))
# Collect 'num_points_to_plot' additional data points
additional_x += list(x[-num_points_to_plot:].value)
additional_y += list(y[-num_points_to_plot:].value)
# Don't use the collected additional data points for the bins
x = x[:-num_points_to_plot]
y = y[:-num_points_to_plot]
hist = np.digitize(x, x_bins)
for bin in range(1, len(x_bins)):
indices_in_this_bin = hist == bin
number_of_items_in_bin = indices_in_this_bin.sum()
if number_of_items_in_bin >= minimum_in_bin:
y_values_in_this_bin = y[indices_in_this_bin].value
means.append(np.mean(y_values_in_this_bin))
standard_deviations.append(np.std(y_values_in_this_bin))
# Bin center is computed as the median of the X values of the data points
# in the bin
centers.append(np.median(x[indices_in_this_bin].value))
# If the number of data points in the bin is less than minimum_in_bin,
# collect these data points if needed
elif number_of_items_in_bin > 0 and return_additional:
additional_x += list(x[indices_in_this_bin].value)
additional_y += list(y[indices_in_this_bin].value)
# Add any points that are larger:
above_highest = hist == len(x_bins)
additional_x += list(x[above_highest].value)
additional_y += list(y[above_highest].value)
# If there is nothing to plot as the mean line, add a test point to avoid having
# empty axes labels or/and wrong axes limits
if len(means) == 0:
float_min = sys.float_info.min
means, centers, standard_deviations = [float_min], [float_min], [float_min]
means = unyt.unyt_array(means, units=y.units, name=y.name)
standard_deviations = unyt.unyt_array(
standard_deviations, units=y.units, name=f"{y.name} ($sigma$)"
)
centers = unyt.unyt_array(centers, units=x.units, name=x.name)
additional_x = unyt.unyt_array(additional_x, units=x.units, name=x.name)
additional_y = unyt.unyt_array(additional_y, units=y.units, name=y.name)
if not return_additional:
return centers, means, standard_deviations
else:
return centers, means, standard_deviations, additional_x, additional_y
[docs]def binned_median_line(
x: unyt.unyt_array,
y: unyt.unyt_array,
x_bins: unyt.unyt_array,
percentiles: List[int] = [16, 84],
minimum_in_bin: int = 3,
return_additional: bool = False,
minimum_additional_points: int = 0,
):
"""
Gets a median (y) line, binned in the x direction.
Parameters
----------
x: unyt.unyt_array
Horizontal values, to be binned.
y: unyt.unyt_array
Vertical values, to have the median calculated in the x-bins
x_bins: unyt.unyt_array
Horizontal bin edges. Must have the same units as x.
percentiles: List[int], optional
Percentiles to return as the positive and negative errors. By
default these are 16 and 84th percentiles.
minimum_in_bin: int, optional
Minimum number of items in a bin to return that bin. If a bin has
fewer values than this, it is excluded from the return values.
Default: 3.
return_additional: bool, optional
Boolean. If true, makes the function return x and y data points that
lie in the bins where the number of data points is smaller than
minimum_in_bin, and any points that are higher than the highest bin
edge. Default: false.
minimum_additional_points: int, optional
Minimum number of additional data points with the highest values of x to return.
Has to be used with return_additional=True. If set to N, then at least N
additional data points will always be present in the plot, regardless of how the
adaptive binning is done. The adaptive binning is stopped at the lowest value of
x among the additional data points so that these points and the median line do
not overlap.
Returns
-------
bin_centers: unyt.unyt_array
The centers of the bins (taken to be the linear mean of the bin edges).
medians: unyt.unyt_array
Vertical median values within the bins.
deviations: unyt.unyt_array
Deviation from median vertically using the ``percentiles`` defined above.
This has shape 2xN.
additional_x: unyt.unyt_array, optional
x data points from the bins where the number of data points is smaller
than minimum_in_bin
additional_y: unyt.unyt_array, optional
y data points from the bins where the number of data points is smaller
than minimum_in_bin
Notes
-----
The return types are such that you can pass this directly to `plt.errorbar`,
as follows:
.. code-block:: python
plt.errorbar(
*binned_median_line(x, y, x_bins, 10)
)
"""
assert (
x.units == x_bins.units
), "Please ensure that the x values and bins have the same units."
medians = []
deviations = []
centers = []
additional_x = []
additional_y = []
# Do we want to have at least 'minimum_additional_points' additional data points in
# the plot?
if return_additional and minimum_additional_points > 0:
# Sort the data along the X axis
idx_sort = np.argsort(x)
x, y = x[idx_sort], y[idx_sort]
# Ensure we don't run out of data points to plot
num_points_to_plot = min(minimum_additional_points, len(x))
# Collect 'num_points_to_plot' additional data points
additional_x += list(x[-num_points_to_plot:].value)
additional_y += list(y[-num_points_to_plot:].value)
# Don't use the collected additional data points for the bins
x = x[:-num_points_to_plot]
y = y[:-num_points_to_plot]
hist = np.digitize(x, x_bins)
for bin in range(1, len(x_bins)):
indices_in_this_bin = hist == bin
number_of_items_in_bin = indices_in_this_bin.sum()
if number_of_items_in_bin >= minimum_in_bin:
y_values_in_this_bin = y[indices_in_this_bin].value
medians.append(np.median(y_values_in_this_bin))
deviations.append(np.percentile(y_values_in_this_bin, percentiles))
# Bin center is computed as the median of the X values of the data points
# in the bin
centers.append(np.median(x[indices_in_this_bin].value))
# If the number of data points in the bin is less than minimum_in_bin,
# collect these data points if needed
elif number_of_items_in_bin > 0 and return_additional:
additional_x += list(x[indices_in_this_bin].value)
additional_y += list(y[indices_in_this_bin].value)
# Add any points that are larger:
above_highest = hist == len(x_bins)
additional_x += list(x[above_highest].value)
additional_y += list(y[above_highest].value)
# If there is nothing to plot as the median line, add a test point to avoid having
# empty axes labels or/and wrong axes limits
if len(medians) == 0:
float_min = sys.float_info.min
medians, centers, deviations = (
[float_min],
[float_min],
[[float_min, float_min]],
)
medians = unyt.unyt_array(medians, units=y.units, name=y.name)
# Percentiles actually gives us the values - we want to be able to use
# matplotlib's errorbar function
deviations = unyt.unyt_array(
abs(np.array(deviations).T - medians.value),
units=y.units,
name=f"{y.name} {percentiles} percentiles",
)
centers = unyt.unyt_array(centers, units=x.units, name=x.name)
additional_x = unyt.unyt_array(additional_x, units=x.units, name=x.name)
additional_y = unyt.unyt_array(additional_y, units=y.units, name=y.name)
if not return_additional:
return centers, medians, deviations
else:
return centers, medians, deviations, additional_x, additional_y