diff --git a/nltools/stats.py b/nltools/stats.py index 1199c617..0b9fa64f 100644 --- a/nltools/stats.py +++ b/nltools/stats.py @@ -72,25 +72,15 @@ from sklearn.utils import check_random_state from sklearn.metrics import pairwise_distances +# Stats imports from burnt-ends +from ends.stats import regress, fdr, holm_bonf, pearson2d as pearson + MAX_INT = np.iinfo(np.int32).max # Optional dependencies sm = attempt_to_import("statsmodels.tsa.arima_model", name="sm") -def pearson(x, y): - """Correlates row vector x with each row vector in 2D array y. - From neurosynth.stats.py - author: Tal Yarkoni - """ - data = np.vstack((x, y)) - ms = data.mean(axis=1)[(slice(None, None, None), None)] - datam = data - ms - datass = np.sqrt(np.sum(datam * datam, axis=1)) - # datass = np.sqrt(ss(datam, axis=1)) - temp = np.dot(datam[1:], datam[0].T) - return temp / (datass[1:] * datass[0]) - - def zscore(df): """zscore every column in a pandas dataframe or series. @@ -109,58 +99,6 @@ def zscore(df): raise ValueError("Data is not a Pandas DataFrame or Series instance") -def fdr(p, q=0.05): - """Determine FDR threshold given a p value array and desired false - discovery rate q. Written by Tal Yarkoni - - Args: - p: (np.array) vector of p-values - q: (float) false discovery rate level - - Returns: - fdr_p: (float) p-value threshold based on independence or positive - dependence - - """ - - if not isinstance(p, np.ndarray): - raise ValueError("Make sure vector of p-values is a numpy array") - if any(p < 0) or any(p > 1): - raise ValueError("array contains p-values that are outside the range 0-1") - - if np.any(p > 1) or np.any(p < 0): - raise ValueError("Does not include valid p-values.") - - s = np.sort(p) - nvox = p.shape[0] - null = np.array(range(1, nvox + 1), dtype="float") * q / nvox - below = np.where(s <= null)[0] - return s[max(below)] if len(below) else -1 - - -def holm_bonf(p, alpha=0.05): - """Compute corrected p-values based on the Holm-Bonferroni method, i.e. step-down procedure applying iteratively less correction to highest p-values. A bit more conservative than fdr, but much more powerful thanvanilla bonferroni. - - Args: - p: (np.array) vector of p-values - alpha: (float) alpha level - - Returns: - bonf_p: (float) p-value threshold based on bonferroni - step-down procedure - - """ - - if not isinstance(p, np.ndarray): - raise ValueError("Make sure vector of p-values is a numpy array") - - s = np.sort(p) - nvox = p.shape[0] - null = 0.05 / (nvox - np.arange(1, nvox + 1) + 1) - below = np.where(s <= null)[0] - return s[max(below)] if len(below) else -1 - - def threshold(stat, p, thr=0.05, return_mask=False): """Threshold test image by p-value from p image @@ -859,67 +797,6 @@ def transform_pairwise(X, y): return np.asarray(X_new), np.vstack((np.asarray(y_new), np.asarray(y_group))).T -def _robust_estimator(vals, X, robust_estimator="hc0", nlags=1): - """ - Computes robust sandwich estimators for standard errors used in OLS computation. Types include: - 'hc0': Huber (1980) sandwich estimator to return robust standard error estimates. - 'hc3': MacKinnon and White (1985) HC3 sandwich estimator. Provides more robustness in smaller samples than HC0 Long & Ervin (2000) - 'hac': Newey-West (1987) estimator for robustness to heteroscedasticity as well as serial auto-correlation at given lags. - - Refs: https://www.wikiwand.com/en/Heteroscedasticity-consistent_standard_errors - https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/linear_model.py - https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf - https://www.stata.com/manuals13/tsnewey.pdf - - Args: - vals (np.ndarray): 1d array of residuals - X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X - robust_estimator (str): estimator type, 'hc0' (default), 'hc3', or 'hac' - nlags (int): number of lags, only used with 'hac' estimator, default is 1 - - Returns: - stderr (np.ndarray): 1d array of standard errors with length == X.shape[1] - - """ - - if robust_estimator not in ["hc0", "hc3", "hac"]: - raise ValueError("robust_estimator must be one of hc0, hc3 or hac") - - # Make a sandwich! - # First we need bread - bread = np.linalg.pinv(np.dot(X.T, X)) - - # Then we need meat - if robust_estimator == "hc0": - V = np.diag(vals**2) - meat = np.dot(np.dot(X.T, V), X) - - elif robust_estimator == "hc3": - V = np.diag(vals**2) / (1 - np.diag(np.dot(X, np.dot(bread, X.T)))) ** 2 - meat = np.dot(np.dot(X.T, V), X) - - elif robust_estimator == "hac": - weights = 1 - np.arange(nlags + 1.0) / (nlags + 1.0) - - # First compute lag 0 - V = np.diag(vals**2) - meat = weights[0] * np.dot(np.dot(X.T, V), X) - - # Now loop over additional lags - for l in range(1, nlags + 1): - - V = np.diag(vals[l:] * vals[:-l]) - meat_1 = np.dot(np.dot(X[l:].T, V), X[:-l]) - meat_2 = np.dot(np.dot(X[:-l].T, V), X[l:]) - - meat += weights[l] * (meat_1 + meat_2) - - # Then we make a sandwich - vcv = np.dot(np.dot(bread, meat), bread) - - return np.sqrt(np.diag(vcv)) - - def summarize_bootstrap(data, save_weights=False): """Calculate summary of bootstrap samples @@ -946,211 +823,6 @@ def summarize_bootstrap(data, save_weights=False): return output -def _arma_func(X, Y, idx=None, **kwargs): - """ - Fit an ARMA(p,q) model. If Y is a matrix and not a vector, expects an idx argument that refers to columns of Y. Used by regress(). - """ - method = kwargs.pop("method", "css-mle") - order = kwargs.pop("order", (1, 1)) - - maxiter = kwargs.pop("maxiter", 50) - disp = kwargs.pop("disp", -1) - start_ar_lags = kwargs.pop("start_ar_lags", order[0] + 1) - transparams = kwargs.pop("transparams", False) - trend = kwargs.pop("trend", "nc") - - if len(Y.shape) == 2: - model = sm.tsa.arima_model.ARMA(endog=Y[:, idx], exog=X.values, order=order) - else: - model = sm.tsa.arima_model.ARMA(endog=Y, exog=X.values, order=order) - try: - res = model.fit( - trend=trend, - method=method, - transparams=transparams, - maxiter=maxiter, - disp=disp, - start_ar_lags=start_ar_lags, - **kwargs - ) - except: - res = model.fit( - trend=trend, - method=method, - transparams=transparams, - maxiter=maxiter, - disp=disp, - start_ar_lags=start_ar_lags, - start_params=np.repeat(1.0, X.shape[1] + 2), - ) - - return ( - res.params[:-2], - res.tvalues[:-2], - res.pvalues[:-2], - res.df_resid, - res.resid, - ) - - -def regress(X, Y, mode="ols", stats="full", **kwargs): - """This is a flexible function to run several types of regression models provided X and Y numpy arrays. Y can be a 1d numpy array or 2d numpy array. In the latter case, results will be output with shape 1 x Y.shape[1], in other words fitting a separate regression model to each column of Y. - - Does NOT add an intercept automatically to the X matrix before fitting like some other software packages. This is left up to the user. - - This function can compute regression in 3 ways: - - 1. Standard OLS - 2. OLS with robust sandwich estimators for standard errors. 3 robust types of - estimators exist: - - - 'hc0' - classic huber-white estimator robust to heteroscedasticity (default) - - 'hc3' - a variant on huber-white estimator slightly more conservative when sample sizes are small - - 'hac' - an estimator robust to both heteroscedasticity and auto-correlation; - auto-correlation lag can be controlled with the `nlags` keyword argument; default - is 1 - - 3. ARMA (auto-regressive moving-average) model (experimental). This model is fit through statsmodels.tsa.arima_model.ARMA, so more information about options can be found there. Any settings can be passed in as kwargs. By default fits a (1,1) model with starting lags of 2. This mode is **computationally intensive** and can take quite a while if Y has many columns. If Y is a 2d array joblib.Parallel is used for faster fitting by parallelizing fits across columns of Y. Parallelization can be controlled by passing in kwargs. Defaults to multi-threading using 10 separate threads, as threads don't require large arrays to be duplicated in memory. Defaults are also set to enable memory-mapping for very large arrays if backend='multiprocessing' to prevent crashes and hangs. Various levels of progress can be monitored using the 'disp' (statsmodels) and 'verbose' (joblib) keyword arguments with integer values > 0. - - Args: - X (ndarray): design matrix; assumes intercept is included - Y (ndarray): dependent variable array; if 2d, a model is fit to each column of Y separately - mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or - 'arma' - stats (str): one of 'full', 'betas', 'tstats'. Useful to speed up calculation if - you know you only need some statistics and not others. Defaults to 'full'. - robust_estimator (str,optional): kind of robust estimator to use if mode = 'robust'; default 'hc0' - nlags (int,optional): auto-correlation lag correction if mode = 'robust' and robust_estimator = 'hac'; default 1 - order (tuple,optional): auto-regressive and moving-average orders for mode = 'arma'; default (1,1) - kwargs (dict): additional keyword arguments to statsmodels.tsa.arima_model.ARMA and joblib.Parallel - - Returns: - b: coefficients - se: standard error of coefficients - t: t-statistics (coef/sterr) - p : p-values - df: degrees of freedom - res: residuals - - Examples: - Standard OLS - - >>> results = regress(X,Y,mode='ols') - - Robust OLS with heteroscedasticity (hc0) robust standard errors - - >>> results = regress(X,Y,mode='robust') - - Robust OLS with heteroscedasticty and auto-correlation (with lag 2) robust standard errors - - >>> results = regress(X,Y,mode='robust',robust_estimator='hac',nlags=2) - - Auto-regressive mode with auto-regressive and moving-average lags = 1 - - >>> results = regress(X,Y,mode='arma',order=(1,1)) - - Auto-regressive model with auto-regressive lag = 2, moving-average lag = 3, and multi-processing instead of multi-threading using 8 cores (this can use a lot of memory if input arrays are very large!). - - >>> results = regress(X,Y,mode='arma',order=(2,3),backend='multiprocessing',n_jobs=8) - - """ - - if not isinstance(mode, str): - raise ValueError("mode must be a string") - - if not isinstance(stats, str): - raise ValueError("stats must be a string") - - if mode not in ["ols", "robust", "arma"]: - raise ValueError("Mode must be one of 'ols','robust' or 'arma'") - - if stats not in ["full", "betas", "tstats"]: - raise ValueError("stats must be one of 'full', 'betas', 'tstats'") - - # Make sure Y is a 2-D array - if len(Y.shape) == 1: - Y = Y[:, np.newaxis] - - # Compute standard errors based on regression mode - if mode == "ols" or mode == "robust": - - b = np.dot(np.linalg.pinv(X), Y) - - # Return betas and stop other computations if that's all that's requested - if stats == "betas": - return b.squeeze() - res = Y - np.dot(X, b) - - # Vanilla OLS - if mode == "ols": - sigma = np.std(res, axis=0, ddof=X.shape[1]) - stderr = ( - np.sqrt(np.diag(np.linalg.pinv(np.dot(X.T, X))))[:, np.newaxis] - * sigma[np.newaxis, :] - ) - - # OLS with robust sandwich estimator based standard-errors - elif mode == "robust": - robust_estimator = kwargs.pop("robust_estimator", "hc0") - nlags = kwargs.pop("nlags", 1) - axis_func = [_robust_estimator, 0, res, X, robust_estimator, nlags] - stderr = np.apply_along_axis(*axis_func) - - # Then only compute t-stats at voxels where the standard error is at least .000001 - t = np.zeros_like(b) - t[stderr > 1.0e-6] = b[stderr > 1.0e-6] / stderr[stderr > 1.0e-6] - - # Return betas and ts and stop other computations if that's all that's requested - if stats == "tstats": - return b.squeeze(), t.squeeze() - df = np.array([X.shape[0] - X.shape[1]] * t.shape[1]) - p = 2 * (1 - t_dist.cdf(np.abs(t), df)) - - # ARMA regression - elif mode == "arma": - if sm is None: - raise ImportError( - "statsmodels>=0.9.0 is required for ARMA regression. Please install this package manually or install nltools with optional arguments: pip install 'nltools[arma]'" - ) - n_jobs = kwargs.pop("n_jobs", -1) - backend = kwargs.pop("backend", "threading") - max_nbytes = kwargs.pop("max_nbytes", 1e8) - verbose = kwargs.pop("verbose", 0) - - # Parallelize if Y vector contains more than 1 column - if len(Y.shape) == 2: - if backend == "threading" and n_jobs == -1: - n_jobs = 10 - par_for = Parallel( - n_jobs=n_jobs, verbose=verbose, backend=backend, max_nbytes=max_nbytes - ) - out_arma = par_for( - delayed(_arma_func)(X, Y, idx=i, **kwargs) for i in range(Y.shape[-1]) - ) - - b = np.column_stack([elem[0] for elem in out_arma]) - t = np.column_stack([elem[1] for elem in out_arma]) - p = np.column_stack([elem[2] for elem in out_arma]) - df = np.array([elem[3] for elem in out_arma]) - res = np.column_stack([elem[4] for elem in out_arma]) - - else: - b, t, p, df, res = _arma_func(X, Y, **kwargs) - - # Arma models don't return stderr, so make a variable for consistent function - # return values - stderr = np.empty_like(b) - - return ( - b.squeeze(), - stderr.squeeze(), - t.squeeze(), - p.squeeze(), - df.squeeze(), - res.squeeze(), - ) - - def regress_permutation( X, Y, n_permute=5000, tail=2, random_state=None, verbose=False, **kwargs ): diff --git a/requirements.txt b/requirements.txt index a8d8cd85..4c22dc85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,4 @@ + burnt-ends>=0.0.2 nibabel>=3.0.1 - scikit-learn>=0.21.0 nilearn>=0.6.0 - pandas >= 1.1.0 - numpy>=1.9 - seaborn>=0.7.0 - matplotlib>=2.2.0 - scipy - pynv - joblib>=0.15 deepdish>=0.3.6