import numpy as np
from scipy import optimize
import matplotlib.pyplot as plt
from scipy.special import expit as sigmoid
from scipy import stats
from scipy.integrate import simpson
Sur-Candès phase transition in logistic regression
Consider the logistic regression estimate \[ \hat\beta = \operatorname{argmin}_{b\in \mathbb R^p} \sum_{i=1}^n \log(1+\exp(-y_i x_i^T b)) \] where \(y_i\in\{-1,1\}\) are binary responses and \(x_i\in \mathbb R^p\) are feature vectors of dimension \(p\). If a binary logistic model holds, of the form \[x_i\sim N(0_p,I_p), \qquad P(y_i=1\mid x_i)=\frac{1}{1+\exp(-x_i^T\beta^*)},\] for some ground truth \(\beta^*\in\mathbb R^p\) with \(\|\beta^*\|=\gamma\), Candès and Sur (2020) predict that the above optimization problem admits a solution, with probability tending to one, if \[ \kappa < h(\gamma) = \min_{t\in \mathbb R} \mathbb E\Big[\max(0,\,t\, y_ix_i^T\beta^* + Z)^2\Big] \] where \(n,p\to+\infty\) simultaneously with limiting ratio \(\kappa=\lim(p/n)\) and \(Z\sim N(0,1)\) is independent of everything else; while if \[ \kappa > h(\gamma) \] then, with probability tending to one, no minimizer exists. The expectation defining \(h(\gamma)\), as well as its gradient with respect to \(t\in\mathbb R\), can be computed by numerical integration.
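For instance, in the signal-free case \(\gamma=0\) the term \(y_ix_i^T\beta^*\) vanishes, the expectation no longer depends on \(t\), and \[ h(0) = \mathbb E\big[\max(0,Z)^2\big] = \tfrac12\,\mathbb E[Z^2] = \tfrac12, \] recovering Cover's classical threshold \(\kappa=1/2\) for the separability of Gaussian points with purely random labels.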
Solving the minimization using numerical integration
# Quadrature grids, each on its own broadcasting axis:
# axis 0: gamma (signal strength), axis 1: Z ~ N(0,1), axis 2: X ~ N(0,1)
# (X plays the role of x_i^T beta* / gamma), axis 3: Y in {+1, -1}.
grid_integration_size = 60
gamma = 0.01 * (1.1 ** np.array(range(100))).reshape((-1, 1, 1, 1))
Z = np.linspace(-6, 6, num=grid_integration_size).reshape((1, -1, 1, 1))
X = np.linspace(-6, 6, num=grid_integration_size + 1).reshape((1, 1, -1, 1))
Y = np.array([1, -1]).reshape((1, 1, 1, 2))
conditional_pmf_Y = sigmoid(X * Y * gamma)   # P(Y = y | X) under the logistic model
def E(x, keepdims=False):
    """Expectation of x(gamma, Z, X, Y) under the joint law of (Z, X, Y), for every gamma."""
    assert len(x.shape) == 4
    x = x * stats.norm.pdf(Z) * conditional_pmf_Y * stats.norm.pdf(X)
    x = np.sum(x, axis=3)                      # sum over Y in {+1, -1}
    x = simpson(x, x=X.squeeze(), axis=2)      # integrate over X
    x = simpson(x, x=Z.squeeze(), axis=1)      # integrate over Z
    if keepdims:
        return x[..., np.newaxis, np.newaxis, np.newaxis]
    return x
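A quick sanity check on the quadrature grid (a small sketch added here, not needed for the computation): integrating the constant function should give total mass close to 1 for every \(\gamma\), and the second moments of \(X\) and \(Z\) should also come out close to 1.

# Optional sanity checks on the quadrature grid: total probability mass and
# the second moments of X and Z should all be close to 1 for every gamma.
print(E(np.ones((1, 1, 1, 1)))[:3])
print(E(X ** 2)[:3])
print(E(Z ** 2)[:3])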
Once we are able to integrate with respect to the joint distribution of \((Z,X,Y)\), we simply perform a gradient descent to find the minimum over \(t\). The gradient of \(t\mapsto \mathbb E\big[\max(0,\,t\, y_ix_i^T\beta^* + Z)^2\big]\) is given by \[ 2\, \mathbb E\big[ y_ix_i^T\beta^* \max(0,\,t\, y_ix_i^T\beta^* + Z) \big]. \] Since \(u\mapsto \max(0,u)^2\) is convex, the objective is convex in \(t\), so a simple gradient descent finds the scalar \(t\) achieving the minimum in the definition of \(h(\gamma)\).
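Concretely, the iteration below performs the unit-step update \[ t \leftarrow t - 2\, \mathbb E\big[ y_ix_i^T\beta^* \max(0,\,t\, y_ix_i^T\beta^* + Z) \big]. \] Two implementation details: the code uses \(-Z\) in place of \(Z\), which leaves the expectation unchanged since \(Z\) is symmetric around zero, and it works with the standardized variable \(X = x_i^T\beta^*/\gamma\), absorbing the factor \(\gamma\) into \(t\); this rescales the minimizer but not the minimum value \(h(\gamma)\).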
# initialization
t = 10 * np.ones_like(gamma)
# gradient descent with unit step size (one value of t per gamma)
for _ in range(500):
    t -= 2 * E(Y * X * np.clip(t * Y * X - Z, a_min=0, a_max=None), keepdims=True)

# value of the objective at the minimizer: h(gamma) for every gamma on the grid
kappa = E(np.clip(t * Y * X - Z, a_min=0, a_max=None) ** 2)
=r"curve $\kappa=h(\gamma)$")
plt.plot(kappa.squeeze(), gamma.squeeze(), labelr"$\kappa$")
plt.xlabel(r"$\gamma$")
plt.ylabel(
plt.legend()0, 15) plt.ylim(
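As an optional cross-check of the curve (a minimal sketch; the helper below is mine and is not part of the original computation), \(h(\gamma)\) at a single value of \(\gamma\) can also be approximated by Monte Carlo together with a one-dimensional minimizer. The result should be close to the value of the curve at that \(\gamma\).

# Optional cross-check: Monte Carlo approximation of h(gamma) at one value of gamma,
# minimizing over t with scipy's one-dimensional optimizer.
def h_monte_carlo(gamma_value, n_samples=1_000_000, seed=0):
    rng = np.random.default_rng(seed)
    x_beta = gamma_value * rng.standard_normal(n_samples)              # x_i^T beta* ~ N(0, gamma^2)
    y = np.where(rng.uniform(size=n_samples) < sigmoid(x_beta), 1, -1)  # logistic labels
    z = rng.standard_normal(n_samples)                                  # independent N(0, 1)
    v = y * x_beta                                                       # V = y_i x_i^T beta*
    def objective(t):
        return np.mean(np.clip(t * v + z, 0, None) ** 2)
    return optimize.minimize_scalar(objective).fun

h_monte_carlo(1.0)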
Simulation experiment
We now verify this phase transition empirically by checking whether \[ \hat\beta = \operatorname{argmin}_{b\in \mathbb R^p} \sum_{i=1}^n \log(1+\exp(-y_i x_i^T b)) \] has a solution. The code below runs an iterative algorithm to solve the above optimization problem. Either the algorithm converges, or it reaches the maximal number of iterations and returns a direction \(\hat b\) that perfectly separates the data, in the sense that \(\min_{i\in[n]} y_i \hat y_i \ge 0\) where \(\hat y_i = x_i^T\hat b\) for each observation \(i\in[n]\). (An alternative separability check based on linear programming is sketched after the example run below.)
from sklearn.linear_model import LogisticRegression
def mle_exists(n, p, gamma, seed=2024):
    """Generate one logistic dataset and report whether the MLE exists (data not separated)."""
    regr = LogisticRegression(penalty=None, fit_intercept=False)
    beta = gamma * np.ones(p) / np.sqrt(p)        # ||beta*|| = gamma
    rng = np.random.default_rng(seed)
    X = rng.standard_normal(size=(n, p))
    y = - (-1) ** rng.binomial(1, sigmoid(X.dot(beta)))   # y_i in {-1, +1}, P(y_i=1|x_i) = sigmoid(x_i^T beta*)
    regr.fit(X, y)
    y_hat = X @ regr.coef_.reshape((p, ))
    return {'p': p, 'n': n, 'gamma': gamma, 'seed': seed, 'kappa': p/n,
            'MLE exists': not (min(y * y_hat) >= 0.0)}

mle_exists(1000, 200, 1.3)
{'p': 200,
'n': 1000,
'gamma': 1.3,
'seed': 2024,
'kappa': 0.2,
'MLE exists': True}
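The criterion \(\min_i y_i\hat y_i\ge 0\) is a separation check: the logistic MLE fails to exist precisely when some direction separates the two classes. If one prefers not to rely on the solver hitting its iteration cap, strict separability can also be tested directly as a linear feasibility problem; the helper below is a sketch of this alternative (it is not used in the experiments that follow, and it ignores the boundary case of quasi-separation).

# Alternative separability check (not used below): the data is strictly separable
# iff some b satisfies y_i x_i^T b >= 1 for all i, which is a linear feasibility problem.
from scipy.optimize import linprog

def separable(X, y):
    n, p = X.shape
    A_ub = -(y[:, None] * X)                  # y_i x_i^T b >= 1  <=>  -y_i x_i^T b <= -1
    res = linprog(c=np.zeros(p), A_ub=A_ub, b_ub=-np.ones(n),
                  bounds=[(None, None)] * p)
    return res.status == 0                    # status 0: a feasible b was found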
We now experiment with \(n=1000\), dimension \(p\) ranging from 1 to \(n/2+50\) in steps of 5, and 100 values of \(\gamma\) equispaced between 0 and 12. For each \((n,p,\gamma)\), the dataset is generated 10 times with different seeds.
n = 1000
parameters = [{'n': 1000, 'p': p, 'gamma': g, 'seed': seed}
              for p in range(1, n//2 + 50, 5)
              for g in np.linspace(0, 12, num=100)
              for seed in range(10)]
from joblib import Parallel, delayed
f"Will now launch {len(parameters)} parallel tasks"
'Will now launch 110000 parallel tasks'
# Heavy parallel computation
data = Parallel(n_jobs=18, verbose=2)(delayed(mle_exists)(**d) for d in parameters)
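Since this sweep takes a while, it may be worth persisting the raw results before plotting; a minimal sketch (the file name is arbitrary):

import pandas as pd
pd.DataFrame(data).to_csv("mle_phase_transition_results.csv", index=False)  # arbitrary file name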
Visualizing the phase transition: theory versus empirical probabilities
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 150
fig, (ax1, ax2) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1.2, 1]})
fig.set_size_inches(15, 5.5)
piv = pd.pivot_table(pd.DataFrame(data),
                     values='MLE exists',
                     index='gamma',
                     columns='kappa')
# empirical probability of MLE existence, one scatter point per (kappa, gamma) cell
piv.unstack().reset_index().plot(x='kappa', y='gamma', c=0, kind='scatter', ax=ax1)
ax1.plot(kappa.squeeze(), gamma.squeeze(), color='red', label='theory', linewidth=3.0)
plt.xlabel('kappa = dimension / sample size')
plt.ylabel(r'gamma = $\sqrt{Var[x_i^T \beta^*]}$')
ax1.legend()
ax2.plot(kappa.squeeze(), gamma.squeeze(), color='red')
ax2.fill_between(kappa.squeeze(), gamma.squeeze(), color="yellowgreen")
maxy = ax2.get_ylim()[1]
ax2.fill_between(kappa.squeeze(), gamma.squeeze(), maxy, color="lightcoral")
ax2.fill_between(np.linspace(0.5, 0.6), np.zeros(50), maxy, color="lightcoral")
ax2.text(0.1, 2.0, "logistic MLE exists")
ax2.text(0.25, 5.0, "logistic MLE does not exist")
for ax in [ax1, ax2]:
    ax.set_ylabel(r'gamma = $\sqrt{Var[x_i^T \beta^*]}$')
    ax.set_xlabel('kappa = dimension / sample size')
    ax.set_xlim(0.02, 0.55)
    ax.set_ylim(0, 12)
    ax.margins(x=0, y=0)