5. Sampling & Resampling

Author

이상민

Published

April 15, 2025

1. Imports

#!pip install ISLP
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)

from sklearn.model_selection import train_test_split
from functools import partial
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

2. Validation and CV

- Load the Auto data and split it

  • increasing test_size leaves fewer observations for training
  • check the validation error
Auto = load_data('Auto')
Auto_train, Auto_valid = train_test_split(Auto, test_size=100, random_state=0)

hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
y_train = Auto_train['mpg']
model = sm.OLS(y_train, X_train)
results = model.fit()
print(results.summary())

X_valid = hp_mm.transform(Auto_valid)
y_valid = Auto_valid['mpg']
valid_pred = results.predict(X_valid)
train_pred = results.predict(X_train)

print(np.mean((y_train - train_pred)**2))  # training MSE
print(np.mean((y_valid - valid_pred)**2))  # validation MSE
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.598
Method:                 Least Squares   F-statistic:                     433.9
Date:                Tue, 15 Apr 2025   Prob (F-statistic):           1.48e-59
Time:                        01:05:33   Log-Likelihood:                -879.71
No. Observations:                 292   AIC:                             1763.
Df Residuals:                     290   BIC:                             1771.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
intercept     39.8828      0.848     47.020      0.000      38.213      41.552
horsepower    -0.1587      0.008    -20.831      0.000      -0.174      -0.144
==============================================================================
Omnibus:                        8.415   Durbin-Watson:                   2.042
Prob(Omnibus):                  0.015   Jarque-Bera (JB):                8.304
Skew:                           0.397   Prob(JB):                       0.0157
Kurtosis:                       3.226   Cond. No.                         327.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
24.228292976800535
23.20193004710425
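
A quick check worth making (a sketch, not part of the original run; the *2 names are ours): the validation estimate depends on which rows land in the validation set, so a different random_state gives a different MSE.

# sketch: redo the split with another seed and recompute the validation MSE
Auto_train2, Auto_valid2 = train_test_split(Auto, test_size=100, random_state=3)
mm2 = MS(['horsepower'])
results2 = sm.OLS(Auto_train2['mpg'], mm2.fit_transform(Auto_train2)).fit()
pred2 = results2.predict(mm2.transform(Auto_valid2))
print(np.mean((Auto_valid2['mpg'] - pred2)**2))  # generally differs from the 23.20 above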

- Define a helper that fits on the training set and returns the validation MSE

def evalMSE(terms, response, train, test):
    mm = MS(terms)
    X_train = mm.fit_transform(train)
    y_train = train[response]
    X_test = mm.transform(test)
    y_test = test[response]
    results = sm.OLS(y_train, X_train).fit()
    test_pred = results.predict(X_test)
    return np.mean((y_test - test_pred)**2)

- Validation error as the polynomial degree varies

MSE = np.zeros(3)
for idx, degree in enumerate(range(1, 4)):
    MSE[idx] = evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_valid)
MSE
array([23.20193005, 16.99348987, 16.9506061 ])

- Cross-validation error (leave-one-out)

hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X, Y = Auto.drop(columns=['mpg']), Auto['mpg']
cv_results = cross_validate(hp_model, X, Y, cv=Auto.shape[0])  # cv = n, i.e. leave-one-out
cv_err = np.mean(cv_results['test_score'])
cv_err
24.231513517929226
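
To spell out what cv=Auto.shape[0] does, here is leave-one-out CV written by hand; a minimal sketch of the same computation (the names design and errors are ours), which reproduces cv_err.

# sketch: explicit leave-one-out loop
design = MS(['horsepower']).fit_transform(Auto)
y = Auto['mpg']
n = Auto.shape[0]
errors = np.zeros(n)
for i in range(n):
    keep = np.delete(np.arange(n), i)                    # all rows except i
    fit = sm.OLS(y.iloc[keep], design.iloc[keep]).fit()  # fit on n-1 rows
    errors[i] = (y.iloc[i] - fit.predict(design.iloc[[i]]).iloc[0])**2
print(errors.mean())  # matches cv_err (~24.23)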

- Leave-one-out CV error as the polynomial degree increases

cv_error = np.zeros(5)
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)

for i, d in enumerate(range(1,6)):
    X = np.power.outer(H, np.arange(d+1))  # design matrix with columns H^0, H^1, ..., H^d
    M_CV = cross_validate(M, X, Y, cv=Auto.shape[0])
    cv_error[i] = np.mean(M_CV['test_score'])

cv_error
array([24.23151352, 19.24821312, 19.33498406, 19.42443036, 19.03323735])

- Check the prediction error across degrees using 40-fold CV

  • shuffle=True: shuffle the rows before splitting
  • n_splits=40: split the data into 40 folds
  • the error is large at degree 1 and drops sharply at degree 2
cv_error = np.zeros(5)
# 40-fold CV splitter
cv = KFold(n_splits=40, shuffle=True, random_state=0)
print(cv)

# use the same splits for each degree
for i, d in enumerate(range(1,6)):
    X = np.power.outer(H, np.arange(d+1))
    M_CV = cross_validate(M, X, Y, cv=cv)
    cv_error[i] = np.mean(M_CV['test_score'])
print(cv_error)
KFold(n_splits=40, random_state=0, shuffle=True)
[24.11505971 19.04928568 19.14454426 19.28592384 18.93589665]

- Check the validation error with a single random split

  • just to note that CV splitters can also be configured explicitly like this
validation = ShuffleSplit(n_splits=1, test_size=196, random_state=0)
results = cross_validate(hp_model, Auto.drop(['mpg'], axis=1), Auto['mpg'], cv=validation)
print(results)
{'fit_time': array([0.00582528]), 'score_time': array([0.00158834]), 'test_score': array([23.61661707])}
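
Raising n_splits turns this into repeated random splitting; a sketch (the *10 names are ours) that also yields a spread for the validation MSE:

# sketch: 10 random splits instead of 1; mean and spread of the validation MSE
validation10 = ShuffleSplit(n_splits=10, test_size=196, random_state=0)
results10 = cross_validate(hp_model, Auto.drop(['mpg'], axis=1), Auto['mpg'], cv=validation10)
print(results10['test_score'].mean(), results10['test_score'].std())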

3. Bootstrap

- Minimizing \(Var(\alpha X + (1-\alpha)Y)\): solving analytically, the optimal \(\alpha\) is \[ \alpha = \frac{ \sigma_Y^2 - \sigma_{XY} }{ \sigma_X^2 + \sigma_Y^2 - 2 \sigma_{XY} } \]

  • X gets weight \(\alpha\) and Y gets weight \(1-\alpha\)
  • the goal is to minimize the variance of the combination (a short derivation follows this list)
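
For reference, the algebra behind the formula (added here for completeness):

\[
\begin{aligned}
f(\alpha) &= Var(\alpha X + (1-\alpha)Y) = \alpha^2 \sigma_X^2 + (1-\alpha)^2 \sigma_Y^2 + 2\alpha(1-\alpha)\sigma_{XY} \\
f'(\alpha) &= 2\alpha \sigma_X^2 - 2(1-\alpha)\sigma_Y^2 + 2(1-2\alpha)\sigma_{XY} = 0 \\
\Rightarrow \quad \alpha &= \frac{\sigma_Y^2 - \sigma_{XY}}{\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY}}
\end{aligned}
\]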

- Compute the covariance matrix by indexing into the data, then plug it into the formula

Portfolio = load_data('Portfolio')
def alpha_func(D, idx):
    # 2x2 sample covariance of columns X and Y over the rows in idx
    cov_ = np.cov(D[['X','Y']].loc[idx], rowvar=False)
    return ((cov_[1,1] - cov_[0,1]) / (cov_[0,0]+cov_[1,1]-2*cov_[0,1]))

- Run the function above

  • an index set can be drawn at random (a bootstrap resample) or simply cover the whole sample; the second assignment below overwrites the first, so the value shown is \(\hat{\alpha}\) on the full data
rng = np.random.default_rng()
idx = rng.choice(100, 100, replace=True)  # a resample of the row indices, with replacement
idx = range(Portfolio.shape[0])           # overwrites the line above: use all rows
alpha_func(Portfolio, idx)
0.5758320745928298

- Applying the bootstrap (\(\alpha\))

  • compute \(\alpha\) on many samples drawn with replacement
  • use these many \(\alpha\) values to estimate the standard error of the estimator \(\hat{\alpha}\)
def boot_SE(func, D, n=None, B=1000, seed=0):
    rng = np.random.default_rng(seed)
    first_, second_ = 0, 0
    n = n or D.shape[0]
    for _ in range(B):
        idx = rng.choice(D.index, n, replace=True)
        value = func(D, idx)
        first_ += value
        second_ += value**2
    return np.sqrt(second_ / B - (first_ / B)**2)  # standard deviation via E[x^2] - (E[x])^2

alpha_SE = boot_SE(alpha_func, Portfolio, B=1000, seed=1)
alpha_SE
0.08945522198999856
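
Written out, boot_SE returns the following (a restatement of the code above; note it uses the \(1/B\) rather than the \(1/(B-1)\) normalization): \[ \widehat{SE}(\hat{\alpha}) = \sqrt{ \frac{1}{B} \sum_{b=1}^{B} \hat{\alpha}^{*2}_b - \Big( \frac{1}{B} \sum_{b=1}^{B} \hat{\alpha}^{*}_b \Big)^2 } \] where \(\hat{\alpha}^{*}_b\) is the estimate computed on the \(b\)-th bootstrap sample.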

- Applying the bootstrap (OLS)

  • draw many samples with replacement and inspect the fitted regression coefficients
  • partial lets the function be called with only the arguments that remain
  • the first two arguments are fixed in advance
def boot_OLS(model_matrix, response, D, idx):
    D_ = D.iloc[idx]  # positional indexing: idx holds row positions here
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.OLS(Y_, X_).fit().params

hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
rng = np.random.default_rng(0)
kk = [hp_func(Auto, rng.choice(392, 392, replace=True)) for _ in range(10)]  # 10 bootstrap fits
print(np.array(kk))
[[39.88064456 -0.1567849 ]
 [38.73298691 -0.14699495]
 [38.31734657 -0.14442683]
 [39.91446826 -0.15782234]
 [39.43349349 -0.15072702]
 [40.36629857 -0.15912217]
 [39.62334517 -0.15449117]
 [39.0580588  -0.14952908]
 [38.66688437 -0.14521037]
 [39.64280792 -0.15555698]]

- Compare the bootstrap standard errors with the usual formula-based standard errors

def boot_OLS(model_matrix, response, D, idx):
    D_ = D.loc[idx]  # label indexing: boot_SE samples labels from D.index
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.OLS(Y_, X_).fit().params

hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
hp_se = boot_SE(hp_func, Auto, B=1000, seed=0)
print(hp_se)

hp_model.fit(Auto, Auto['mpg'])
model_se = summarize(hp_model.results_)['std err']
print(model_se)
intercept     0.717654
horsepower    0.006041
dtype: float64
intercept     0.717
horsepower    0.006
Name: std err, dtype: float64

- The same comparison, now for a quadratic model

quad_model = MS([poly('horsepower', 2, raw=True)])
quad_func = partial(boot_OLS, quad_model, 'mpg')
print(boot_SE(quad_func, Auto, B=1000))

M = sm.OLS(Auto['mpg'], quad_model.fit_transform(Auto))
print(summarize(M.fit())['std err'])
intercept                                  1.538641
poly(horsepower, degree=2, raw=True)[0]    0.024696
poly(horsepower, degree=2, raw=True)[1]    0.000090
dtype: float64
intercept                                  1.800
poly(horsepower, degree=2, raw=True)[0]    0.031
poly(horsepower, degree=2, raw=True)[1]    0.000
Name: std err, dtype: float64