1. imports
#!pip install ISLP
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
from sklearn.model_selection import train_test_split
from functools import partial
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm
2. Validation and CV
- Load the Auto data and split off a validation set
- Increasing test_size shrinks the training set; check the validation error
Auto = load_data('Auto')
Auto_train, Auto_valid = train_test_split(Auto, test_size=100, random_state=0)
hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
y_train = Auto_train['mpg']
model = sm.OLS(y_train, X_train)
results = model.fit()
print(results.summary())
X_valid = hp_mm.transform(Auto_valid)
y_valid = Auto_valid['mpg']
valid_pred = results.predict(X_valid)
train_pred = results.predict(X_train)
print(np.mean((y_train - train_pred)**2))
print(np.mean((y_valid - valid_pred)**2))
OLS Regression Results
==============================================================================
Dep. Variable: mpg R-squared: 0.599
Model: OLS Adj. R-squared: 0.598
Method: Least Squares F-statistic: 433.9
Date: Tue, 15 Apr 2025 Prob (F-statistic): 1.48e-59
Time: 01:05:33 Log-Likelihood: -879.71
No. Observations: 292 AIC: 1763.
Df Residuals: 290 BIC: 1771.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
intercept 39.8828 0.848 47.020 0.000 38.213 41.552
horsepower -0.1587 0.008 -20.831 0.000 -0.174 -0.144
==============================================================================
Omnibus: 8.415 Durbin-Watson: 2.042
Prob(Omnibus): 0.015 Jarque-Bera (JB): 8.304
Skew: 0.397 Prob(JB): 0.0157
Kurtosis: 3.226 Cond. No. 327.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
24.228292976800535
23.20193004710425
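- The validation estimate itself depends on which observations land in the split; a minimal sketch (reusing Auto and hp_mm from above) repeats the split with different seeds to show how much the validation MSE moves around:
# Sketch: the validation MSE varies from split to split
for seed in range(5):
    tr, va = train_test_split(Auto, test_size=100, random_state=seed)
    res = sm.OLS(tr['mpg'], hp_mm.fit_transform(tr)).fit()
    print(seed, np.mean((va['mpg'] - res.predict(hp_mm.transform(va)))**2))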
- Define a helper that fits on the training set and returns the validation MSE
def evalMSE(terms, response, train, test):
    mm = MS(terms)
    X_train = mm.fit_transform(train)
    y_train = train[response]
    X_test = mm.transform(test)
    y_test = test[response]
    results = sm.OLS(y_train, X_train).fit()
    test_pred = results.predict(X_test)
    return np.mean((y_test - test_pred)**2)
- Check the validation error as the degree of the polynomial model varies
MSE = np.zeros(3)
for idx, degree in enumerate(range(1, 4)):
    MSE[idx] = evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_valid)
MSE
array([23.20193005, 16.99348987, 16.9506061 ])
- Cross-validation error (leave-one-out)
hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X, Y = Auto.drop(columns=['mpg']), Auto['mpg']
cv_results = cross_validate(hp_model, X, Y, cv=Auto.shape[0])
cv_err = np.mean(cv_results['test_score'])
cv_err
24.231513517929226
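- Passing cv=Auto.shape[0] makes every fold a single observation, i.e. leave-one-out; the same request can be made explicit with sklearn's LeaveOneOut splitter (a sketch, equivalent to the call above):
from sklearn.model_selection import LeaveOneOut
# Explicit LOOCV splitter instead of cv=n
loo_results = cross_validate(hp_model, X, Y, cv=LeaveOneOut())
print(np.mean(loo_results['test_score']))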
- Check the leave-one-out CV error as the polynomial degree increases
cv_error = np.zeros(5)
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)
for i, d in enumerate(range(1, 6)):
    X = np.power.outer(H, np.arange(d+1))
    M_CV = cross_validate(M, X, Y, cv=Auto.shape[0])
    cv_error[i] = np.mean(M_CV['test_score'])
cv_error
array([24.23151352, 19.24821312, 19.33498406, 19.42443036, 19.03323735])
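- The design matrix above comes from np.power.outer, whose (i, j) entry is H[i]**j, so the columns are 1, H, H**2, ..., H**d; a two-row illustration:
# np.power.outer builds the raw polynomial design matrix
print(np.power.outer(np.array([2., 3.]), np.arange(3)))
# [[1. 2. 4.]
#  [1. 3. 9.]]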
- Use 40-fold CV to check the prediction error for each polynomial degree
- shuffle=True: shuffle the observations randomly before splitting
- n_splits=40: split the data into 40 folds
- The error is large at degree 1 and drops sharply at degree 2
cv_error = np.zeros(5)
# 40-fold CV; reuse the same splits for every degree
cv = KFold(n_splits=40, shuffle=True, random_state=0)
print(cv)
for i, d in enumerate(range(1, 6)):
    X = np.power.outer(H, np.arange(d+1))
    M_CV = cross_validate(M, X, Y, cv=cv)
    cv_error[i] = np.mean(M_CV['test_score'])
print(cv_error)
KFold(n_splits=40, random_state=0, shuffle=True)
[24.11505971 19.04928568 19.14454426 19.28592384 18.93589665]
- Split the data in two and check the validation error
- This just shows that CV can also be run by specifying the split explicitly
validation = ShuffleSplit(n_splits=1, test_size=196, random_state=0)
results = cross_validate(hp_model,
                         Auto.drop(['mpg'], axis=1), Auto['mpg'],
                         cv=validation)
print(results)
{'fit_time': array([0.00582528]), 'score_time': array([0.00158834]), 'test_score': array([23.61661707])}
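- With n_splits above 1, ShuffleSplit repeats the random split; the mean and spread of the scores sketch the variability of the validation estimate (the training sets overlap across repeats, so the spread is not a valid standard error):
validation10 = ShuffleSplit(n_splits=10, test_size=196, random_state=0)
results10 = cross_validate(hp_model,
                           Auto.drop(['mpg'], axis=1), Auto['mpg'],
                           cv=validation10)
print(results10['test_score'].mean(), results10['test_score'].std())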
3. Bootstrap
- The problem of minimizing \(Var(\alpha X + (1-\alpha)Y)\): differentiating with respect to \(\alpha\) and setting the derivative to zero gives \[ \alpha = \frac{ \sigma_Y^2 - \sigma_{XY} }{ \sigma_X^2 + \sigma_Y^2 - 2 \sigma_{XY} } \]
- X gets weight \(\alpha\) and Y gets weight \(1-\alpha\)
- The goal is to minimize the variance of the combination
- Estimate the variances and covariance by indexing into the observed data and plug them into the formula (a numeric check follows below)
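- A sanity check of the closed form on simulated data (all names below are illustrative, not part of the lab): compare the formula's \(\alpha\) against a brute-force grid minimizer of the sample variance.
# Sketch: verify the closed-form alpha numerically
rng_sim = np.random.default_rng(10)
X_sim = rng_sim.normal(0, 1, 10000)
Y_sim = 0.5 * X_sim + rng_sim.normal(0, 1, 10000)
sX, sY = X_sim.var(), Y_sim.var()
sXY = np.cov(X_sim, Y_sim)[0, 1]
alpha_formula = (sY - sXY) / (sX + sY - 2 * sXY)
grid = np.linspace(0, 1, 1001)
var_grid = [np.var(a * X_sim + (1 - a) * Y_sim) for a in grid]
print(alpha_formula, grid[np.argmin(var_grid)])  # the two should nearly agree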
Portfolio = load_data('Portfolio')
def alpha_func(D, idx):
    cov_ = np.cov(D[['X','Y']].loc[idx], rowvar=False)
    return ((cov_[1,1] - cov_[0,1]) / (cov_[0,0] + cov_[1,1] - 2*cov_[0,1]))
- Run the function above
- Pick the indices at random and check the result
rng = np.random.default_rng()
idx = rng.choice(100, 100, replace=True)
idx = range(Portfolio.shape[0])  # note: this overwrites the random draw, so the full sample is used
alpha_func(Portfolio, idx)
0.5758320745928298
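- Since the second assignment overwrote the random draw, the value printed above is the full-sample estimate; to see a genuinely resampled \(\alpha\), pass the bootstrap indices directly (a sketch reusing rng from above):
boot_idx = rng.choice(100, 100, replace=True)  # one bootstrap draw of the indices
print(alpha_func(Portfolio, boot_idx))         # varies from draw to draw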
- Applying the bootstrap (to \(\alpha\))
- Compute \(\alpha\) on many samples drawn with replacement
- Use these many \(\alpha\) values to estimate the standard error of the estimator \(\hat{\alpha}\)
def boot_SE(func, D, n=None, B=1000, seed=0):
    rng = np.random.default_rng(seed)
    first_, second_ = 0, 0
    n = n or D.shape[0]
    for _ in range(B):
        idx = rng.choice(D.index, n, replace=True)
        value = func(D, idx)
        first_ += value
        second_ += value**2
    return np.sqrt(second_ / B - (first_ / B)**2)  # standard deviation of the bootstrap values
alpha_SE = boot_SE(alpha_func, Portfolio, B=1000, seed=1)
alpha_SE
0.08945522198999856
- Applying the bootstrap (OLS)
- Draw many samples with replacement and examine the resulting regression coefficients
- partial lets the function be called with only the remaining arguments rather than all of them
- The first two arguments are frozen in advance (a tiny illustration follows)
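- How partial freezes arguments, shown with a hypothetical function f (illustration only, not part of the lab):
def f(a, b, c, d):
    return (a, b, c, d)
g = partial(f, 1, 2)  # the first two arguments are fixed
print(g(3, 4))        # -> (1, 2, 3, 4)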
def boot_OLS(model_matrix, response, D, idx):
    D_ = D.iloc[idx]  # positional indexing: rng.choice below returns positions
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.OLS(Y_, X_).fit().params
hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
rng = np.random.default_rng(0)
kk = [hp_func(Auto, rng.choice(392, 392, replace=True)) for _ in range(10)]
print(np.array(kk))
[[39.88064456 -0.1567849 ]
[38.73298691 -0.14699495]
[38.31734657 -0.14442683]
[39.91446826 -0.15782234]
[39.43349349 -0.15072702]
[40.36629857 -0.15912217]
[39.62334517 -0.15449117]
[39.0580588 -0.14952908]
[38.66688437 -0.14521037]
[39.64280792 -0.15555698]]
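- A quick check: the column-wise standard deviations of these ten draws already give a rough bootstrap SE for the intercept and slope, though ten replications is far too few for a stable estimate (B=1000 is used below):
print(np.array(kk).std(axis=0, ddof=1))  # rough SE from only 10 replications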
- Compare the bootstrap standard errors with the formula-based ones
def boot_OLS(model_matrix, response, D, idx):
    D_ = D.loc[idx]  # label-based indexing, matching boot_SE's draws from D.index
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.OLS(Y_, X_).fit().params
hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
hp_se = boot_SE(hp_func, Auto, B=1000, seed=0)
print(hp_se)
hp_model.fit(Auto, Auto['mpg'])
model_se = summarize(hp_model.results_)['std err']
print(model_se)
intercept     0.717654
horsepower    0.006041
dtype: float64
intercept     0.717
horsepower    0.006
Name: std err, dtype: float64
quad_model = MS([poly('horsepower', 2, raw=True)])
quad_func = partial(boot_OLS, quad_model, 'mpg')
print(boot_SE(quad_func, Auto, B=1000))
M = sm.OLS(Auto['mpg'], quad_model.fit_transform(Auto))
print(summarize(M.fit())['std err'])
intercept                                  1.538641
poly(horsepower, degree=2, raw=True)[0]    0.024696
poly(horsepower, degree=2, raw=True)[1]    0.000090
dtype: float64
intercept                                  1.800
poly(horsepower, degree=2, raw=True)[0]    0.031
poly(horsepower, degree=2, raw=True)[1]    0.000
Name: std err, dtype: float64
- The bootstrap SEs do not rely on the usual OLS assumptions about the errors, so they serve as a useful check on the formula-based SEs