Source code for ipsuite.analysis.model.plots

import typing

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.interpolate import interpn
from scipy.optimize import curve_fit
from scipy.stats import foldnorm, gaussian_kde



[docs]
def density_scatter(ax, x, y, bins, **kwargs) -> None:
    """Create a scatter plot colored by 2d histogram density.

    Parameters
    ----------
    ax: matplotlib.axes.Axes
    x: np.ndarray
    y: np.ndarray
    bins: int
    kwargs
        any kwargs passed to 'ax.scatter'

    Returns
    -------

    References
    ----------
    Adapted from https://stackoverflow.com/a/53865762/10504481

    """
    # convert e.g. DataFrame to numpy array values
    x = np.array(x)
    y = np.array(y)

    if "cmap" not in kwargs:
        kwargs["cmap"] = "viridis"

    data, x_e, y_e = np.histogram2d(x, y, bins=bins, density=True)
    points = (0.5 * (x_e[1:] + x_e[:-1]), 0.5 * (y_e[1:] + y_e[:-1]))
    xi = np.vstack([x, y]).T
    z = interpn(points, data, xi, method="splinef2d", bounds_error=False)
    # To be sure to plot all data
    z[np.where(np.isnan(z))] = 0.0
    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
    ax.scatter(x, y, c=z, **kwargs)




[docs]
def get_figure(
    true,
    prediction,
    datalabel: str,
    xlabel: str,
    ylabel: str,
    ymax: typing.Optional[float] = None,
    figsize: tuple = (10, 7),
    density=True,
) -> plt.Figure:
    """Create a correlation plot for true, prediction values.

    Parameters
    ----------
    true: the true values
    prediction: the predicted values
    datalabel: str, the label for the prediction, e.g. 'MAE: 0.123 eV'
    xlabel: str, the xlabel
    ylabel: str, the xlabel
    figsize: tuple, size of the figure

    Returns
    -------
    plt.Figure

    """
    sns.set()
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(true, np.zeros_like(true), color="grey", zorder=0)
    bins = 25
    if true.shape[0] < 20 or not density:
        # don't use density for very small datasets
        ax.scatter(true, prediction, marker="x", s=20.0, label=datalabel)
    else:
        density_scatter(
            ax, true, prediction, bins=bins, marker="x", s=20.0, label=datalabel
        )
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if ymax:
        ax.set_ylim([-ymax, ymax])
    ax.legend()
    return fig




[docs]
def get_calibration_figure(
    error,
    std,
    markersize: float = 3.0,
    datalabel=None,
    forces=False,
    figsize: tuple = (10, 7),
):
    """Log-log plot of errors vs predicted standard deviations with quantiles
    for a linearly increasing noise level.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=300)

    x = np.linspace(1e-6, 5e3, 5)
    noise_level_2 = x

    quantiles_lower_01 = [foldnorm.ppf(0.15, 0.0, 0.0, i) for i in noise_level_2]
    quantiles_upper_01 = [foldnorm.ppf(0.85, 0.0, 0.0, i) for i in noise_level_2]
    quantiles_lower_05 = [foldnorm.ppf(0.05, 0.0, 0.0, i) for i in noise_level_2]
    quantiles_upper_05 = [foldnorm.ppf(0.95, 0.0, 0.0, i) for i in noise_level_2]
    quantiles_lower_005 = [foldnorm.ppf(0.005, 0.0, 0.0, i) for i in noise_level_2]
    quantiles_upper_005 = [foldnorm.ppf(0.995, 0.0, 0.0, i) for i in noise_level_2]

    ax.scatter(
        std,
        error,
        s=markersize,
        alpha=0.3,
        color="tab:blue",
        rasterized=True,
        linewidth=0.0,
        label=datalabel,
    )
    ax.loglog()
    ax.plot(x, quantiles_upper_05, color="gray", alpha=0.5)
    ax.plot(x, quantiles_lower_05, color="gray", alpha=0.5)
    ax.plot(x, quantiles_upper_01, color="gray", alpha=0.5)
    ax.plot(x, quantiles_lower_01, color="gray", alpha=0.5)
    ax.plot(x, quantiles_upper_005, color="gray", alpha=0.5)
    ax.plot(x, quantiles_lower_005, color="gray", alpha=0.5)

    ax.plot(
        np.logspace(-10, 100.0), np.logspace(-10, 100.0), linestyle="--", color="grey"
    )
    xlower = max(np.min(std) / 1.5, 1e-7)
    ylower = max(np.min(error) / 1.5, 1e-7)
    ax.set_xlim(xlower, np.max(std) * 1.5)
    ax.set_ylim(ylower, np.max(error) * 1.5)

    if forces:
        xlabel = r"$\sigma_{f_{i\alpha}}(A)$ [meV/$\AA$] "
        ylabel = r"$|\Delta f_{i\alpha}(A)|$ [meV/$\AA$] "
    else:
        xlabel = r"$\sigma_{E_{i}}(A)$ [meV/atom] "
        ylabel = r"$|\Delta E_{i}(A)|$ [meV/atom] "

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if datalabel:
        ax.legend()
    return fig




[docs]
def gauss(x, *p):
    m, s = p
    return np.exp(-(((x - m) / s) ** 2) * 0.5) / np.sqrt(2 * np.pi * s**2)




[docs]
def slice_ensemble_uncertainty(true, pred_ens, slice_start, slice_end):
    pred_mean = np.mean(pred_ens, axis=1)
    pred_std = np.std(pred_ens, axis=1)

    isel = np.where((slice_start < pred_std) & (pred_std < slice_end))[0]

    error_true = np.reshape(true[isel] - pred_mean[isel], -1)
    error_pred = np.reshape(pred_ens[isel, :] - pred_mean[isel, np.newaxis], -1)
    return error_true, error_pred




[docs]
def slice_uncertainty(true, pred_mean, pred_std, slice_start, slice_end):
    isel = np.where((slice_start < pred_std) & (pred_std < slice_end))[0]

    error_true = np.reshape(true[isel] - pred_mean[isel], -1)
    error_pred = pred_std[isel]
    return error_true, error_pred




[docs]
def get_gaussianicity_figure(error_true, error_pred, forces=True):
    """Plots empirical and predicted error distributions.
    If possible, it also tries to fit a gaussian to the empirical distribution.
    """
    true_kde_sel = gaussian_kde(error_true)
    ens_kde_sel = gaussian_kde(error_pred)

    bounds = 1.5 * max(np.max(np.abs(error_true)), np.max(np.abs(error_pred)))

    fig, ax = plt.subplots()

    xgrid = np.linspace(-bounds, bounds, 400)
    ax.set_xlim([-bounds, bounds])

    ens_sel = ens_kde_sel(xgrid)
    true_sel = true_kde_sel(xgrid)

    try:
        guess = [0.0, 100]
        coeff, _ = curve_fit(gauss, xgrid, true_sel, p0=guess)
        std = coeff[1]
        ax.semilogy(xgrid, gauss(xgrid, 0, std), "k--", label="Gaussian")

    except RuntimeError:
        print("Curve fit failed, only plotting distributions")

    ax.semilogy(xgrid, true_sel, "r-", label="empirical")
    ax.semilogy(xgrid, ens_sel, "b-", label="predicted")
    ymax = 5 * max(np.max(true_sel), np.max(ens_sel))
    ax.set_ylim(1e-6, ymax)
    ax.set_yscale("log")

    if forces:
        xlabel = r"$\Delta (S)$ / meV/Ang"
        ylabel = r"$p(\Delta | S)$"
    else:
        xlabel = r"$\Delta (S)$ / meV/atom"
        ylabel = r"$p(\Delta | S)$"

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend()
    return fig




[docs]
def get_hist(data, label, xlabel, ylabel) -> typing.Tuple[plt.Figure, plt.Axes]:
    fig, ax = plt.subplots()

    sns.histplot(
        data,
        ax=ax,
        stat="percent",
        label=label,
    )
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend()

    return fig, ax




[docs]
def get_histogram_figure(
    bin_edges,
    counts,
    datalabel: str,
    xlabel: str,
    ylabel: str,
    x_lim: typing.Tuple[float, float] = None,
    y_lim: typing.Tuple[float, float] = None,
    logy_scale=True,
    figsize: tuple = (10, 7),
) -> plt.Figure:
    """Creates a Matplotlib figure based on precomputed bin edges and counts.

    Parameters
    ----------
    bin_edges: np.array
        Edges of the histogram bins.
    counts: np.array
        Number of occurrences in each bin.
    datalabel: str
        Labels for the figure legend.
    xlabel: str
        X-axis label.
    ylabel: str
        Y-axis label.
    x_lim: tuple
        X-axis limits.
    y_lim: tuple
        Y-axis limits.
    figsize: tuple
        Size of the Matplotlib figure
    """
    sns.set()
    fig, ax = plt.subplots(figsize=figsize)

    ax.stairs(counts, bin_edges, label=datalabel, fill=True)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend()
    if logy_scale:
        ax.set_yscale("log")
    if x_lim is not None:
        ax.set_xlim(x_lim)
    if y_lim is not None:
        ax.set_ylim(y_lim)
    return fig