1. imports

#!pip install ISLP
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
from sklearn.model_selection import train_test_split
from functools import partial
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

2. Validation and CV
- Load the Auto data and split it into training and validation sets
- Increasing test_size shrinks the training set
- Check the validation error

Auto = load_data('Auto')
Auto_train, Auto_valid = train_test_split(Auto, test_size=100, random_state=0)
hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
y_train = Auto_train['mpg']
model = sm.OLS(y_train, X_train)
results = model.fit()
print(results.summary())

X_valid = hp_mm.transform(Auto_valid)
y_valid = Auto_valid['mpg']
valid_pred = results.predict(X_valid)
train_pred = results.predict(X_train)
print(np.mean((y_train - train_pred)**2))
print(np.mean((y_valid - valid_pred)**2))
OLS Regression Results
==============================================================================
Dep. Variable: mpg R-squared: 0.599
Model: OLS Adj. R-squared: 0.598
Method: Least Squares F-statistic: 433.9
Date: Tue, 15 Apr 2025 Prob (F-statistic): 1.48e-59
Time: 01:05:33 Log-Likelihood: -879.71
No. Observations: 292 AIC: 1763.
Df Residuals: 290 BIC: 1771.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
intercept 39.8828 0.848 47.020 0.000 38.213 41.552
horsepower -0.1587 0.008 -20.831 0.000 -0.174 -0.144
==============================================================================
Omnibus: 8.415 Durbin-Watson: 2.042
Prob(Omnibus): 0.015 Jarque-Bera (JB): 8.304
Skew: 0.397 Prob(JB): 0.0157
Kurtosis: 3.226 Cond. No. 327.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
24.228292976800535
23.20193004710425
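The test_size effect noted in the bullets above is easy to check directly. A minimal sketch (the loop and the names size, tr, va, design, fit are new here; everything else is already in scope) that refits the degree-1 model for a few split sizes:

# Vary the validation-set size and watch the errors move (illustrative only).
for size in (50, 100, 200, 300):
    tr, va = train_test_split(Auto, test_size=size, random_state=0)
    design = MS(['horsepower'])
    X_tr = design.fit_transform(tr)
    fit = sm.OLS(tr['mpg'], X_tr).fit()
    mse = np.mean((va['mpg'] - fit.predict(design.transform(va)))**2)
    print(size, mse)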
- Define a function that fits the model on the training set and returns the validation MSE

def evalMSE(terms, response, train, test):
    mm = MS(terms)
    X_train = mm.fit_transform(train)
    y_train = train[response]
    X_test = mm.transform(test)
    y_test = test[response]
    results = sm.OLS(y_train, X_train).fit()
    test_pred = results.predict(X_test)
    return np.mean((y_test - test_pred)**2)

- Check the validation error while varying the degree of the polynomial model

MSE = np.zeros(3)
for idx, degree in enumerate(range(1, 4)):
    MSE[idx] = evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_valid)
MSE
array([23.20193005, 16.99348987, 16.9506061 ])
- Cross-validation error (leave-one-out)

hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X, Y = Auto.drop(columns=['mpg']), Auto['mpg']
cv_results = cross_validate(hp_model, X, Y, cv=Auto.shape[0])
cv_err = np.mean(cv_results['test_score'])
cv_err
24.231513517929226
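Setting cv=Auto.shape[0] makes every fold a single observation, which is exactly leave-one-out CV. As a sketch, the same number should come from scikit-learn's explicit LeaveOneOut splitter (only the import is new):

from sklearn.model_selection import LeaveOneOut

loo = cross_validate(hp_model, X, Y, cv=LeaveOneOut())
print(np.mean(loo['test_score']))  # should agree with cv_err above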
- Check the leave-one-out CV error as the degree of the polynomial grows

cv_error = np.zeros(5)
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)

for i, d in enumerate(range(1, 6)):
    X = np.power.outer(H, np.arange(d+1))
    M_CV = cross_validate(M, X, Y, cv=Auto.shape[0])
    cv_error[i] = np.mean(M_CV['test_score'])
cv_error
array([24.23151352, 19.24821312, 19.33498406, 19.42443036, 19.03323735])
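np.power.outer(H, np.arange(d+1)) raises every horsepower value to the powers 0 through d, so column 0 is a constant intercept column and the remaining columns are the polynomial terms. A tiny illustration on made-up values:

h = np.array([2., 3.])
print(np.power.outer(h, np.arange(3)))
# [[1. 2. 4.]
#  [1. 3. 9.]]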
- Check the prediction error for each degree with 40-fold CV
- shuffle=True: shuffle the data at random before splitting
- n_splits=40: split the data into 40 folds
- The error is large at degree 1 and drops sharply at degree 2

cv_error = np.zeros(5)
cv = KFold(n_splits=40, shuffle=True, random_state=0)  # CV with 40 folds
print(cv)
# use the same splits for each degree
for i, d in enumerate(range(1, 6)):
    X = np.power.outer(H, np.arange(d+1))
    M_CV = cross_validate(M, X, Y, cv=cv)
    cv_error[i] = np.mean(M_CV['test_score'])
print(cv_error)
KFold(n_splits=40, random_state=0, shuffle=True)
[24.11505971 19.04928568 19.14454426 19.28592384 18.93589665]
- Check the validation error with a single train/test split
- This just shows that cv can also be given an explicit splitter object

validation = ShuffleSplit(n_splits=1, test_size=196, random_state=0)
results = cross_validate(hp_model, Auto.drop(['mpg'], axis=1), Auto['mpg'], cv=validation)
print(results)
{'fit_time': array([0.00582528]), 'score_time': array([0.00158834]), 'test_score': array([23.61661707])}
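With n_splits larger than 1, ShuffleSplit repeats the random split, which also gives a feel for how variable the validation error itself is. A sketch with ten repeats (only n_splits changes):

validation = ShuffleSplit(n_splits=10, test_size=196, random_state=0)
results = cross_validate(hp_model, Auto.drop(['mpg'], axis=1), Auto['mpg'], cv=validation)
print(results['test_score'].mean(), results['test_score'].std())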
3. Bootstrap
- Minimizing \(Var(\alpha X + (1-\alpha)Y)\): solving analytically, the optimal \(\alpha\) is \[ \alpha = \frac{ \sigma_Y^2 - \sigma_{XY} }{ \sigma_X^2 + \sigma_Y^2 - 2 \sigma_{XY} } \] (the one-line derivation is spelled out below)
- X gets weight \(\alpha\) and Y gets weight \(1-\alpha\)
- The goal is to make the variance as small as possible
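Filling in the calculus behind the formula above: expand the variance and set its derivative with respect to \(\alpha\) to zero,
\[ Var(\alpha X + (1-\alpha)Y) = \alpha^2 \sigma_X^2 + (1-\alpha)^2 \sigma_Y^2 + 2\alpha(1-\alpha)\sigma_{XY}, \]
\[ \frac{d}{d\alpha} Var = 2\alpha\sigma_X^2 - 2(1-\alpha)\sigma_Y^2 + 2(1-2\alpha)\sigma_{XY} = 0 \quad\Longrightarrow\quad \alpha = \frac{\sigma_Y^2 - \sigma_{XY}}{\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY}}. \]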
- Obtain the covariance matrix from an index into the data and plug it into the formula

Portfolio = load_data('Portfolio')

def alpha_func(D, idx):
    cov_ = np.cov(D[['X','Y']].loc[idx], rowvar=False)
    return ((cov_[1,1] - cov_[0,1]) / (cov_[0,0] + cov_[1,1] - 2*cov_[0,1]))
- Run the function above
- Pick the index at random (with replacement) and check the result

rng = np.random.default_rng()
idx = rng.choice(100, 100, replace=True)
idx = range(Portfolio.shape[0])  # note: this overwrites the random index, so the full sample is used below
alpha_func(Portfolio, idx)
0.5758320745928298
- Applying the bootstrap (\(\alpha\))
- Compute \(\alpha\) on many samples drawn with replacement
- Use the resulting collection of \(\alpha\) values to estimate the standard error of the estimator \(\hat{\alpha}\)

def boot_SE(func, D, n=None, B=1000, seed=0):
    rng = np.random.default_rng(seed)
    first_, second_ = 0, 0
    n = n or D.shape[0]
    for _ in range(B):
        idx = rng.choice(D.index, n, replace=True)
        value = func(D, idx)
        first_ += value
        second_ += value**2
    return np.sqrt(second_ / B - (first_ / B)**2)  # standard deviation of the bootstrap values

alpha_SE = boot_SE(alpha_func, Portfolio, B=1000, seed=1)
alpha_SE
0.08945522198999856
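The return line of boot_SE uses the running-sums identity for a variance, so the individual bootstrap values never need to be stored:
\[ \widehat{SE}(\hat{\alpha}) = \sqrt{ \frac{1}{B}\sum_{b=1}^{B}\hat{\alpha}_b^2 - \Big( \frac{1}{B}\sum_{b=1}^{B}\hat{\alpha}_b \Big)^2 }. \]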
- Applying the bootstrap (OLS)
- Draw many samples with replacement and inspect the resulting regression coefficients
- partial lets the function be called with only the remaining arguments (see the small sketch after the output below)
- The first two arguments are frozen in advance

def boot_OLS(model_matrix, response, D, idx):
    D_ = D.iloc[idx]  # positional indexing here; the .loc variant appears below
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.OLS(Y_, X_).fit().params

hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
rng = np.random.default_rng(0)
kk = [hp_func(Auto, rng.choice(392, 392, replace=True)) for _ in range(10)]
print(np.array(kk))
[[39.88064456 -0.1567849 ]
[38.73298691 -0.14699495]
[38.31734657 -0.14442683]
[39.91446826 -0.15782234]
[39.43349349 -0.15072702]
[40.36629857 -0.15912217]
[39.62334517 -0.15449117]
[39.0580588 -0.14952908]
[38.66688437 -0.14521037]
[39.64280792 -0.15555698]]
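The partial mechanics referenced in the bullets above, shown on a throwaway function (f and g are hypothetical names used only for this illustration):

from functools import partial

def f(a, b, c):
    return (a, b, c)

g = partial(f, 1, 2)  # the first two arguments are frozen
print(g(3))           # (1, 2, 3) -- only the remaining argument is supplied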
- Compare the bootstrap standard errors with the formula-based OLS standard errors

def boot_OLS(model_matrix, response, D, idx):
    D_ = D.loc[idx]  # label-based indexing this time, since boot_SE samples from D.index
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.OLS(Y_, X_).fit().params

hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
hp_se = boot_SE(hp_func, Auto, B=1000, seed=0)
print(hp_se)

hp_model.fit(Auto, Auto['mpg'])
model_se = summarize(hp_model.results_)['std err']
print(model_se)
intercept 0.717654
horsepower 0.006041
dtype: float64
intercept 0.717
horsepower 0.006
Name: std err, dtype: float64
quad_model = MS([poly('horsepower', 2, raw=True)])
quad_func = partial(boot_OLS, quad_model, 'mpg')
print(boot_SE(quad_func, Auto, B=1000))

M = sm.OLS(Auto['mpg'], quad_model.fit_transform(Auto))
print(summarize(M.fit())['std err'])
intercept 1.538641
poly(horsepower, degree=2, raw=True)[0] 0.024696
poly(horsepower, degree=2, raw=True)[1] 0.000090
dtype: float64
intercept 1.800
poly(horsepower, degree=2, raw=True)[0] 0.031
poly(horsepower, degree=2, raw=True)[1] 0.000
Name: std err, dtype: float64