334 changes: 3 additions & 331 deletions nltools/stats.py
@@ -72,25 +72,15 @@
from sklearn.utils import check_random_state
from sklearn.metrics import pairwise_distances

# Stats imports from burnt-ends
from ends.stats import regress, fdr, holm_bonf, pearson2d as pearson
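
These names previously lived in this module; the refactor re-imports them from the burnt-ends package so the public `nltools.stats` API is unchanged. A minimal usage sketch (assuming `ends.stats` exposes compatible signatures, as the import above implies):

```python
import numpy as np
from nltools.stats import fdr  # resolves to ends.stats.fdr after this change

p_values = np.random.uniform(size=1000)
thresh = fdr(p_values, q=0.05)  # same call signature as the removed local version
```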

MAX_INT = np.iinfo(np.int32).max

# Optional dependencies
sm = attempt_to_import("statsmodels.tsa.arima_model", name="sm")


def pearson(x, y):
"""Correlates row vector x with each row vector in 2D array y.
From neurosynth.stats.py - author: Tal Yarkoni
"""
    data = np.vstack((x, y))
    ms = data.mean(axis=1)[:, np.newaxis]
    datam = data - ms
    datass = np.sqrt(np.sum(datam * datam, axis=1))
    temp = np.dot(datam[1:], datam[0])
    return temp / (datass[1:] * datass[0])
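
A quick sanity-check sketch for the function above (illustrative only; assumes the re-exported `pearson` matches the removed implementation shown here): each returned value should equal the corresponding row-wise `np.corrcoef` entry.

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(100)        # row vector
y = rng.standard_normal((5, 100))   # five row vectors to correlate with x

r = pearson(x, y)
expected = np.array([np.corrcoef(x, row)[0, 1] for row in y])
assert np.allclose(r, expected)
```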


def zscore(df):
"""zscore every column in a pandas dataframe or series.

@@ -109,58 +99,6 @@ def zscore(df):
raise ValueError("Data is not a Pandas DataFrame or Series instance")


def fdr(p, q=0.05):
"""Determine FDR threshold given a p value array and desired false
discovery rate q. Written by Tal Yarkoni

Args:
p: (np.array) vector of p-values
q: (float) false discovery rate level

Returns:
fdr_p: (float) p-value threshold based on independence or positive
dependence

"""

    if not isinstance(p, np.ndarray):
        raise ValueError("Make sure vector of p-values is a numpy array")
    if np.any(p < 0) or np.any(p > 1):
        raise ValueError("array contains p-values that are outside the range 0-1")

s = np.sort(p)
nvox = p.shape[0]
null = np.array(range(1, nvox + 1), dtype="float") * q / nvox
below = np.where(s <= null)[0]
return s[max(below)] if len(below) else -1
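
A worked illustration of the step-up logic above, using a tiny hypothetical p-value vector: the sorted p-values are compared against the critical line i * q / n, and the threshold is the largest p-value still under its line.

```python
import numpy as np

p = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.060, 0.074, 0.205])
q = 0.05
s = np.sort(p)
crit = np.arange(1, len(p) + 1) * q / len(p)  # i * q / n
below = np.where(s <= crit)[0]
thresh = s[below.max()] if below.size else -1
# thresh == 0.008: only the two smallest p-values fall under their critical line
assert np.isclose(fdr(p, q=q), thresh)
```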


def holm_bonf(p, alpha=0.05):
"""Compute corrected p-values based on the Holm-Bonferroni method, i.e. step-down procedure applying iteratively less correction to highest p-values. A bit more conservative than fdr, but much more powerful thanvanilla bonferroni.

Args:
p: (np.array) vector of p-values
alpha: (float) alpha level

Returns:
bonf_p: (float) p-value threshold based on bonferroni
step-down procedure

"""

if not isinstance(p, np.ndarray):
raise ValueError("Make sure vector of p-values is a numpy array")

s = np.sort(p)
nvox = p.shape[0]
    null = alpha / (nvox - np.arange(1, nvox + 1) + 1)
below = np.where(s <= null)[0]
return s[max(below)] if len(below) else -1
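
For contrast with the FDR example above, the same hypothetical p-value vector under the Holm-Bonferroni critical values alpha / (n - i + 1):

```python
import numpy as np

p = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.060, 0.074, 0.205])
alpha = 0.05
s = np.sort(p)
n = len(p)
crit = alpha / (n - np.arange(1, n + 1) + 1)  # alpha/n, alpha/(n-1), ..., alpha/1
below = np.where(s <= crit)[0]
thresh = s[below.max()] if below.size else -1
# thresh == 0.001, stricter than the 0.008 FDR threshold for the same data
assert np.isclose(holm_bonf(p, alpha=alpha), thresh)
```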


def threshold(stat, p, thr=0.05, return_mask=False):
"""Threshold test image by p-value from p image

@@ -859,67 +797,6 @@ def transform_pairwise(X, y):
return np.asarray(X_new), np.vstack((np.asarray(y_new), np.asarray(y_group))).T


def _robust_estimator(vals, X, robust_estimator="hc0", nlags=1):
"""
Computes robust sandwich estimators for standard errors used in OLS computation. Types include:
'hc0': Huber (1980) sandwich estimator to return robust standard error estimates.
    'hc3': MacKinnon and White (1985) HC3 sandwich estimator. Provides more robustness in smaller samples than HC0 (Long & Ervin, 2000)
'hac': Newey-West (1987) estimator for robustness to heteroscedasticity as well as serial auto-correlation at given lags.

Refs: https://www.wikiwand.com/en/Heteroscedasticity-consistent_standard_errors
https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/linear_model.py
https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf
https://www.stata.com/manuals13/tsnewey.pdf

Args:
vals (np.ndarray): 1d array of residuals
X (np.ndarray): design matrix used in OLS, e.g. Brain_Data().X
robust_estimator (str): estimator type, 'hc0' (default), 'hc3', or 'hac'
nlags (int): number of lags, only used with 'hac' estimator, default is 1

Returns:
stderr (np.ndarray): 1d array of standard errors with length == X.shape[1]

"""

if robust_estimator not in ["hc0", "hc3", "hac"]:
raise ValueError("robust_estimator must be one of hc0, hc3 or hac")

# Make a sandwich!
# First we need bread
bread = np.linalg.pinv(np.dot(X.T, X))

# Then we need meat
if robust_estimator == "hc0":
V = np.diag(vals**2)
meat = np.dot(np.dot(X.T, V), X)

elif robust_estimator == "hc3":
V = np.diag(vals**2) / (1 - np.diag(np.dot(X, np.dot(bread, X.T)))) ** 2
meat = np.dot(np.dot(X.T, V), X)

elif robust_estimator == "hac":
weights = 1 - np.arange(nlags + 1.0) / (nlags + 1.0)

# First compute lag 0
V = np.diag(vals**2)
meat = weights[0] * np.dot(np.dot(X.T, V), X)

# Now loop over additional lags
        for lag in range(1, nlags + 1):
            V = np.diag(vals[lag:] * vals[:-lag])
            meat_1 = np.dot(np.dot(X[lag:].T, V), X[:-lag])
            meat_2 = np.dot(np.dot(X[:-lag].T, V), X[lag:])

            meat += weights[lag] * (meat_1 + meat_2)

# Then we make a sandwich
vcv = np.dot(np.dot(bread, meat), bread)

return np.sqrt(np.diag(vcv))
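
An illustrative sketch of calling the helper above: fit OLS by pseudoinverse, then compute HC0 sandwich standard errors from the residuals (the simulated noise here is deliberately heteroscedastic).

```python
import numpy as np

rng = np.random.default_rng(0)
n = 200
X = np.column_stack([np.ones(n), rng.standard_normal((n, 2))])  # intercept included
beta = np.array([1.0, 0.5, -0.25])
y = X @ beta + rng.standard_normal(n) * (1 + np.abs(X[:, 1]))   # heteroscedastic noise

b = np.linalg.pinv(X) @ y
resid = y - X @ b
se_hc0 = _robust_estimator(resid, X, robust_estimator="hc0")  # one SE per column of X
```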


def summarize_bootstrap(data, save_weights=False):
"""Calculate summary of bootstrap samples

@@ -946,211 +823,6 @@
return output


def _arma_func(X, Y, idx=None, **kwargs):
"""
Fit an ARMA(p,q) model. If Y is a matrix and not a vector, expects an idx argument that refers to columns of Y. Used by regress().
"""
method = kwargs.pop("method", "css-mle")
order = kwargs.pop("order", (1, 1))

maxiter = kwargs.pop("maxiter", 50)
disp = kwargs.pop("disp", -1)
start_ar_lags = kwargs.pop("start_ar_lags", order[0] + 1)
transparams = kwargs.pop("transparams", False)
trend = kwargs.pop("trend", "nc")

if len(Y.shape) == 2:
model = sm.tsa.arima_model.ARMA(endog=Y[:, idx], exog=X.values, order=order)
else:
model = sm.tsa.arima_model.ARMA(endog=Y, exog=X.values, order=order)
try:
res = model.fit(
trend=trend,
method=method,
transparams=transparams,
maxiter=maxiter,
disp=disp,
start_ar_lags=start_ar_lags,
**kwargs
)
    except Exception:
        # Fall back to explicit starting parameters if the default fit fails
res = model.fit(
trend=trend,
method=method,
transparams=transparams,
maxiter=maxiter,
disp=disp,
start_ar_lags=start_ar_lags,
start_params=np.repeat(1.0, X.shape[1] + 2),
)

return (
res.params[:-2],
res.tvalues[:-2],
res.pvalues[:-2],
res.df_resid,
res.resid,
)


def regress(X, Y, mode="ols", stats="full", **kwargs):
"""This is a flexible function to run several types of regression models provided X and Y numpy arrays. Y can be a 1d numpy array or 2d numpy array. In the latter case, results will be output with shape 1 x Y.shape[1], in other words fitting a separate regression model to each column of Y.

    Does NOT automatically add an intercept to the X matrix before fitting, unlike some other software packages; this is left up to the user.

This function can compute regression in 3 ways:

1. Standard OLS
    2. OLS with robust sandwich estimators for standard errors. Three types of robust
       estimator are available:

- 'hc0' - classic huber-white estimator robust to heteroscedasticity (default)
- 'hc3' - a variant on huber-white estimator slightly more conservative when sample sizes are small
- 'hac' - an estimator robust to both heteroscedasticity and auto-correlation;
auto-correlation lag can be controlled with the `nlags` keyword argument; default
is 1

3. ARMA (auto-regressive moving-average) model (experimental). This model is fit through statsmodels.tsa.arima_model.ARMA, so more information about options can be found there. Any settings can be passed in as kwargs. By default fits a (1,1) model with starting lags of 2. This mode is **computationally intensive** and can take quite a while if Y has many columns. If Y is a 2d array joblib.Parallel is used for faster fitting by parallelizing fits across columns of Y. Parallelization can be controlled by passing in kwargs. Defaults to multi-threading using 10 separate threads, as threads don't require large arrays to be duplicated in memory. Defaults are also set to enable memory-mapping for very large arrays if backend='multiprocessing' to prevent crashes and hangs. Various levels of progress can be monitored using the 'disp' (statsmodels) and 'verbose' (joblib) keyword arguments with integer values > 0.

Args:
X (ndarray): design matrix; assumes intercept is included
Y (ndarray): dependent variable array; if 2d, a model is fit to each column of Y separately
mode (str): kind of model to fit; must be one of 'ols' (default), 'robust', or
'arma'
stats (str): one of 'full', 'betas', 'tstats'. Useful to speed up calculation if
you know you only need some statistics and not others. Defaults to 'full'.
robust_estimator (str,optional): kind of robust estimator to use if mode = 'robust'; default 'hc0'
nlags (int,optional): auto-correlation lag correction if mode = 'robust' and robust_estimator = 'hac'; default 1
order (tuple,optional): auto-regressive and moving-average orders for mode = 'arma'; default (1,1)
kwargs (dict): additional keyword arguments to statsmodels.tsa.arima_model.ARMA and joblib.Parallel

Returns:
b: coefficients
se: standard error of coefficients
t: t-statistics (coef/sterr)
p : p-values
df: degrees of freedom
res: residuals

Examples:
Standard OLS

>>> results = regress(X,Y,mode='ols')

Robust OLS with heteroscedasticity (hc0) robust standard errors

>>> results = regress(X,Y,mode='robust')

    Robust OLS with heteroscedasticity and auto-correlation (with lag 2) robust standard errors

>>> results = regress(X,Y,mode='robust',robust_estimator='hac',nlags=2)

Auto-regressive mode with auto-regressive and moving-average lags = 1

>>> results = regress(X,Y,mode='arma',order=(1,1))

Auto-regressive model with auto-regressive lag = 2, moving-average lag = 3, and multi-processing instead of multi-threading using 8 cores (this can use a lot of memory if input arrays are very large!).

>>> results = regress(X,Y,mode='arma',order=(2,3),backend='multiprocessing',n_jobs=8)

"""

if not isinstance(mode, str):
raise ValueError("mode must be a string")

if not isinstance(stats, str):
raise ValueError("stats must be a string")

if mode not in ["ols", "robust", "arma"]:
raise ValueError("Mode must be one of 'ols','robust' or 'arma'")

if stats not in ["full", "betas", "tstats"]:
raise ValueError("stats must be one of 'full', 'betas', 'tstats'")

# Make sure Y is a 2-D array
if len(Y.shape) == 1:
Y = Y[:, np.newaxis]

# Compute standard errors based on regression mode
if mode == "ols" or mode == "robust":

b = np.dot(np.linalg.pinv(X), Y)

# Return betas and stop other computations if that's all that's requested
if stats == "betas":
return b.squeeze()
res = Y - np.dot(X, b)

# Vanilla OLS
if mode == "ols":
sigma = np.std(res, axis=0, ddof=X.shape[1])
stderr = (
np.sqrt(np.diag(np.linalg.pinv(np.dot(X.T, X))))[:, np.newaxis]
* sigma[np.newaxis, :]
)

# OLS with robust sandwich estimator based standard-errors
elif mode == "robust":
robust_estimator = kwargs.pop("robust_estimator", "hc0")
nlags = kwargs.pop("nlags", 1)
            stderr = np.apply_along_axis(
                _robust_estimator, 0, res, X, robust_estimator, nlags
            )

# Then only compute t-stats at voxels where the standard error is at least .000001
t = np.zeros_like(b)
t[stderr > 1.0e-6] = b[stderr > 1.0e-6] / stderr[stderr > 1.0e-6]

# Return betas and ts and stop other computations if that's all that's requested
if stats == "tstats":
return b.squeeze(), t.squeeze()
df = np.array([X.shape[0] - X.shape[1]] * t.shape[1])
p = 2 * (1 - t_dist.cdf(np.abs(t), df))

# ARMA regression
elif mode == "arma":
if sm is None:
raise ImportError(
"statsmodels>=0.9.0 is required for ARMA regression. Please install this package manually or install nltools with optional arguments: pip install 'nltools[arma]'"
)
n_jobs = kwargs.pop("n_jobs", -1)
backend = kwargs.pop("backend", "threading")
max_nbytes = kwargs.pop("max_nbytes", 1e8)
verbose = kwargs.pop("verbose", 0)

# Parallelize if Y vector contains more than 1 column
if len(Y.shape) == 2:
if backend == "threading" and n_jobs == -1:
n_jobs = 10
par_for = Parallel(
n_jobs=n_jobs, verbose=verbose, backend=backend, max_nbytes=max_nbytes
)
out_arma = par_for(
delayed(_arma_func)(X, Y, idx=i, **kwargs) for i in range(Y.shape[-1])
)

b = np.column_stack([elem[0] for elem in out_arma])
t = np.column_stack([elem[1] for elem in out_arma])
p = np.column_stack([elem[2] for elem in out_arma])
df = np.array([elem[3] for elem in out_arma])
res = np.column_stack([elem[4] for elem in out_arma])

else:
b, t, p, df, res = _arma_func(X, Y, **kwargs)

# Arma models don't return stderr, so make a variable for consistent function
# return values
stderr = np.empty_like(b)

return (
b.squeeze(),
stderr.squeeze(),
t.squeeze(),
p.squeeze(),
df.squeeze(),
res.squeeze(),
)
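
A self-contained usage sketch for the OLS path (hypothetical data; assumes the caller supplies the intercept column, per the docstring above):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 100
X = np.column_stack([np.ones(n), rng.standard_normal(n)])  # design with intercept
Y = rng.standard_normal((n, 10))                           # 10 dependent variables

b, se, t, p, df, res = regress(X, Y, mode="ols")
print(b.shape)  # (2, 10): one column of coefficients per column of Y
```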


def regress_permutation(
X, Y, n_permute=5000, tail=2, random_state=None, verbose=False, **kwargs
):
9 changes: 1 addition & 8 deletions requirements.txt
@@ -1,11 +1,4 @@
burnt-ends>=0.0.2
nibabel>=3.0.1
scikit-learn>=0.21.0
nilearn>=0.6.0
pandas >= 1.1.0
numpy>=1.9
seaborn>=0.7.0
matplotlib>=2.2.0
scipy
pynv
joblib>=0.15
deepdish>=0.3.6