Source code for isopy.toolbox.regress

from matplotlib.axes import Axes as _Axes
from matplotlib.patches import Polygon as _Polygon
from scipy import stats
import numpy as np
from isopy import core
from isopy import toolbox
from isopy import array_functions


__add__ = ['linregress', 'yorkregress', 'yorkregress2', 'yorkregress3', 'yorkregress95', 'yorkregress99']

import isopy.checks


[docs]def linregress(x, y):
    """
    Calculate a linear least-squares regression for two sets of measurements.

    Uses ``scipy.stats.linregress(x, y)`` for the regression. See
    `here<https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html>`_
    for the scipy documentation.

    Parameters
    ----------
    x, y : numpy_array_like
        Data points through which the regression should be calculated.


    Returns
    -------
    result : LinregressResult
        The returned object contains the following attributes:

        * **slope** - Slope of the regression line.
        * **intercept** - Intercept of the regression line.
        * **slope_se** - Standard error of the slope.
        * **intercept_se** - Standard error of the intercept.
        * **df** - The degrees of freedom of the regression (N - 2).

        The returned object also contains the following methods:

        * **y(x)** - Returns the value of y at *x*
        * **label(sigfig)** - Returns a string in the format "y=mx + c" with *sigfig* significant figures.

    """
    #  * **pvalue** - Two-sided p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic.
    # * **rvalue** - Correlation coefficient.
    result = stats.linregress(x, y)

    return LinregressResult(result.slope, result.intercept, result.stderr, result.intercept_stderr, len(x) - 2)

[docs]def yorkregress(x, y, xerr, yerr, r=0, err_ci = None, err_zscore=None, result_ci = None, result_zscore=None, tol=1e-10):
    """
    Calculate the slope and intercept taking into account the uncertainty on x and y.

    Uncertainties on the slope and intercept are given as standard errors. Based on the formulas from `York et al. (2004)
    American Journal of Physics 72, 367 <https://doi.org/10.1119/1.1632486>`_.

    If neither *err_ci* or *err_zscore* are given it assumes the uncertainties represent 1 SD/SE. If neither
    *result_ci* or *result_zscore* are giuen the *err_ci*/*err_zscore* is also used for the
    uncertainty on the slope and intercept.

    The functions *yorkregress2* and *yorkregress3* are also avaliable with preset values for *err_zscore*
    of 2 and 3 respectively. Similarly, *yorkregress95* and *yorkregress99* exists with preset values
    *err_ci* 0.95 and 0.99.

    Parameters
    ----------
    x, y : numpy_array_like
        Data points through which the regression should be calculated.
    xerr, yerr : numpy_array_like
        Uncertainties for *x* and *y* values. Can be an array with the same size as *x* and *y* or
        a scalar value which will be used for every datapoint.
    r : float, optional
        Correlation coefficient between errors in x and y. Default value is ``0``.
    err_ci : float, optional
        The confidence interval of *xerr* and *yerr*.
    err_zscore : float, optional
        The zscore of *xerr* and *yerr*.
    result_ci : float, optional
        The desired confidence interval of the uncertainty on the slope and intercept.
    result_zscore : float, optional
        The desired zscore of the uncertainty on the slope and intercept.
    tol : float, optional
        Tolerance for fit. Default value is ``1e-10``.

    Returns
    -------
    result : YorkregressResult
        The returned object contains the following attributes:

        * **slope** - The slope of the regression
        * **intercept** - The y-axis intercept of the regression
        * **slope_se** - The standard error of the slope
        * **intercept_se** - The standard error of the y-axis intercept
        * **msdw** - The mean square weighted deviate of the regression.
        * **df** - The degrees of freedom of the regression (N - 2).
        * **pvalue** - The two-tailed probability of the chi-squared value with *df* degrees of freedom.

        The returned object also contains the following methods:

        * **y(x)** - Returns the value of y at *x*
        * **yerr(x)** -  Returns the standard error on y at *x*.
        * **label(sigfig)** - Returns a string in the format "y=mx + c, msdw" with *sigfig* significant figures.
    """
    #   * **sumW** - The sum of the weights.
    #   * **xbar** - The weighted mean of x.

    x = isopy.checks.check_type('x', x, np.ndarray, coerce=True, coerce_into=np.array)
    y = isopy.checks.check_type('y', y, np.ndarray, coerce=True, coerce_into=np.array)
    xerr = isopy.checks.check_type('xerr', xerr, np.ndarray, np.float64, coerce=True,
                                   coerce_into=[np.float64, np.array])
    yerr = isopy.checks.check_type('yerr', yerr, np.ndarray, np.float64, coerce=True,
                                   coerce_into=[np.float64, np.array])
    r = isopy.checks.check_type('r', r, np.ndarray, np.float64, coerce=True,
                                coerce_into=[np.float64, np.array])
    tol = isopy.checks.check_type('tol', tol, np.float64, coerce=True)

    if err_ci is not None and err_zscore is not None:
        raise ValueError('Both error ci and zscore was given')
    if result_ci is not None and result_zscore is not None:
        raise ValueError('Both result ci and zscore was given')
    if err_ci is not None and (err_ci < 0 or err_ci > 1):
        raise ValueError('error confidence interval must be between 0 and 1')
    if result_ci is not None and (result_ci < 0 or result_ci > 1):
        raise ValueError('result confidence interval must be between 0 and 1')

    X = x
    Y = y
    Xerr = xerr
    Yerr = yerr

    if r > 1 or r < 0:
        raise ValueError('r must be between 0 and 1')
    if X.ndim != 1:
        raise ValueError('parameter "x": must have 1 dimension not {}'.format(X.ndim))
    if Y.ndim != 1:
        raise ValueError('parameter "y": must have 1 dimension not {}'.format(X.ndim))

    if X.size != Y.size:
        raise ValueError('x and y must have the same size')

    if not isinstance(Xerr, np.ndarray):
        Xerr = np.ones(X.size) * Xerr
    elif Xerr.size != X.size:
        raise ValueError('xerr and x must have the same size')

    if not isinstance(Yerr, np.ndarray):
        Yerr = np.ones(Y.size) * Yerr
    elif Yerr.size != Y.size:
        raise ValueError('yerr and y must have the same size')

    if err_ci is not None:
        ci = array_functions.calculate_ci(err_ci)
        Xerr = Xerr / ci
        Yerr = Yerr / ci
        if result_ci is None and result_zscore is None:
            result_ci = err_ci
    elif err_zscore is not None:
        Xerr = Xerr / err_zscore
        Yerr = Yerr / err_zscore
        if result_ci is None and result_zscore is None:
            result_zscore = err_zscore

    # Do calculations
    wX = 1 / (Xerr ** 2)
    wY = 1 / (Yerr ** 2)

    k = np.sqrt(wX * wY)
    slope = (np.mean(X * Y) - np.mean(X) * np.mean(Y)) / (np.mean(X ** 2) - np.mean(X) ** 2)

    # Iteratively converge self.slope until tol is reached
    for i in range(1000):
        W = (wX * wY) / (wX + (slope ** 2) * wY - 2 * slope * r * k)
        Xbar = np.sum(W * X) / np.sum(W)
        Ybar = np.sum(W * Y) / np.sum(W)
        U = X - Xbar
        V = Y - Ybar
        beta = W * (U / wY + (slope * V) / wX - (slope * U + V) * (r / k))

        b2 = np.sum(W * beta * V) / np.sum(W * beta * U)
        dif = np.abs(b2 / slope - 1)
        slope = b2
        if dif < tol: break
    else:
        raise ValueError('Unable to calculate a fit after 1000 iterations.')

    # Calculate self.intercept
    intercept = Ybar - slope * Xbar

    # Calculate adjusted points
    x = Xbar + beta
    xbar = np.sum(W * x) / np.sum(W)
    u = x - xbar

    y = Ybar + slope * beta
    ybar = np.sum(W * y) / np.sum(W)
    v = y - ybar

    # Calculate the uncertianty on the slope
    slope_error = np.sqrt(1 / np.sum(W * (u ** 2)))
    intercept_error = np.sqrt(1 / np.sum(W) + ((0 - xbar) ** 2) * (slope_error ** 2))

    # Goodness of fit
    S = np.sum(W * (Y - slope * X - intercept) ** 2)

    dof = x.size - 2
    if dof > 0:
        #mXY = np.array([X - x, Y - y]).T.reshape(-1, 1, 2)
        #mXYerr = np.array([Xerr * Xerr, r * Xerr * Yerr, r * Xerr * Yerr, Yerr * Yerr]).T.reshape(-1, 2, 2)
        #miXYerr = np.linalg.inv(mXYerr)
        #mXYt = mXY.reshape(-1, 2, 1)
        #rmXY = np.sum(mXY @ miXYerr @ mXYt) same as S

        msdw = S / dof
        p = 1 - stats.chi2.cdf(S, dof)
        p2 = (p-0.5)*2
    else:
        msdw = 1
        p2 = 1

    if result_ci is not None:
        slope_se = slope_error * stats.norm.ppf(0.5 + result_ci / 2)
    elif result_zscore is not None:
        slope_se = slope_error * result_zscore
    else:
        slope_se = slope_error

    intercept_se = np.sqrt(1 / np.sum(W) + ((0 - xbar) ** 2) * (slope_se ** 2))

    return YorkregressResult(slope, intercept, slope_se, intercept_se, dof, msdw, p2,
                             np.sum(W), xbar)


yorkregress1 = core.partial_func(yorkregress, 'yorkregress2', err_zscore=1, result_zscore=1)
yorkregress2 = core.partial_func(yorkregress, 'yorkregress2', err_zscore=2, result_zscore=2)
yorkregress3 = core.partial_func(yorkregress, 'yorkregress3', err_zscore=3, result_zscore=3)
yorkregress95 = core.partial_func(yorkregress, 'yorkregress95', err_ci=0.95, result_ci=0.95)
yorkregress99 = core.partial_func(yorkregress, 'yorkregress99', err_ci=0.99, result_ci=0.99)


class LinregressResult:
    def __init__(self, slope, intercept, slope_se, intercept_se, df):
        self.slope = slope
        self.intercept = intercept
        self.slope_se = slope_se
        self.intercept_se = intercept_se
        self.df = df

    def __repr__(self):
        return (f'{self.__class__.__name__}(slope={self.slope}, intercept={self.intercept}, '
                f'slope_se={self.slope_se}, intercept_se={self.intercept_se}, '
                f'df={self.df})')

    def y(self, x):
        """
        Return y at *x*.
        """
        return self.slope * x + self.intercept

    def y_se(self, x):
        """
        Currently not avaliable for linear regressions. Raises ``NotImplementedError``.
        """
        raise NotImplementedError('"y_se" is currently not avaliable for linear regressions')

    def label(self, sigfig=5):
        label = 'y='
        if np.isnan(self.slope):
            label = f'{label}(nan±nan) + (nan±nan)'
        else:
            label = f'{label}({toolbox.plot._format_sigfig(self.slope, sigfig, self.slope_se)}'
            label = f'{label}±{toolbox.plot._format_sigfig(self.slope_se, sigfig, self.slope_se)})x'
            label = f'{label} + ({toolbox.plot._format_sigfig(self.intercept, sigfig, self.intercept_se)}'
            label = f'{label}±{toolbox.plot._format_sigfig(self.intercept_se, sigfig, self.intercept_se)})'
        return label

class YorkregressResult(LinregressResult):
    def __init__(self, slope, intercept, slope_se, intercept_se, df, msdw, pvalue, sumW, xbar):
        self.slope = slope
        self.intercept = intercept
        self.slope_se = slope_se
        self.intercept_se = intercept_se
        self.df = df
        self.msdw = msdw
        self.pvalue = pvalue
        self._sumW = sumW
        self._xbar = xbar

    def __repr__(self):
        return (f'{self.__class__.__name__}(slope={self.slope}, intercept={self.intercept}, '
                f'slope_se={self.slope_se}, intercept_se={self.intercept_se}, df={self.df}, '
                f'msdw={self.msdw}, pvalue={self.pvalue})')

    def y_se(self, x):
        """
        Return the standard error of y at *x*.
        """
        return np.sqrt(1 / self._sumW + ((x - self._xbar) ** 2) * (self.slope_se ** 2))

    def label(self, sigfig=5):
        label = 'y='
        if np.isnan(self.slope):
            label = f'{label}(nan±nan) + (nan±nan), msdw=nan'
        else:
            label = f'{label}({toolbox.plot._format_sigfig(self.slope, sigfig, self.slope_se)}'
            label = f'{label}±{toolbox.plot._format_sigfig(self.slope_se, sigfig, self.slope_se)})x'
            label = f'{label} + ({toolbox.plot._format_sigfig(self.intercept, sigfig, self.intercept_se)}'
            label = f'{label}±{toolbox.plot._format_sigfig(self.intercept_se, sigfig, self.intercept_se)})'
            label = f'{label}, msdw={self.msdw:.2f}'
        return label