import numpy as np, pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (summarize, poly, ModelSpec as MS)
from statsmodels.stats.anova import anova_lm
1. imports
from pygam import (s as s_gam, l as l_gam, f as f_gam, LinearGAM, LogisticGAM)
from ISLP.transforms import (BSpline, NaturalSpline)
from ISLP.models import bs, ns
from ISLP.pygam import (approx_lam, degrees_of_freedom, plot as plot_gam, anova as anova_gam)
2. Wage 데이터 분석
= load_data('Wage')
Wage = Wage['wage']
y = Wage['age'] age
-
4차 다항식(\(x,x^2,x^3,x^4\))으로 분석
# 4차 다항식의 변수확장을 위한 전 처리
# ------------------------------------
= MS([poly('age', degree=4)]).fit(Wage)
poly_age = sm.OLS(y, poly_age.transform(Wage)).fit()
M
summarize(M)
= np.linspace(age.min(), age.max(),100)
age_grid = pd.DataFrame({'age': age_grid}) age_df
# 4차 다항식의 적합 결과를 그리기 위한 함수를 만들고 적합해서 결과를 확인함
# -------------------------------------------------------------------------
def plot_wage_fit(age_df, basis, title):
= basis.transform(Wage)
X = basis.transform(age_df)
Xnew = sm.OLS(y, X).fit()
M = M.get_prediction(Xnew)
preds = preds.conf_int(alpha=0.05)
bands = subplots(figsize=(8,8))
fig , ax ='gray', alpha=0.5)
ax.scatter(age, y, facecolorfor val, ls in zip([preds.predicted_mean, bands[:,0], bands[:,1]],['b','r--','r--']):
=3)
ax.plot(age_df.values , val , ls, linewidth=20)
ax.set_title(title , fontsize 'Age', fontsize=20)
ax.set_xlabel('Wage', fontsize=20);
ax.set_ylabel(return ax
# 가운데 선은 평균, 위 아래 선들은 95% 수준의 (평균에 대한) 신뢰구간들임
# ---------------------------------------------------------------------
'Degree -4 Polynomial'); plot_wage_fit(age_df, poly_age,
-
다항식의 차수를 1차에서 5차까지 조정해보며 확인
#위 결과는 위에 있는 것임
summarize(M) = [MS([poly('age', degree=d)]) for d in range(1, 6)]
models = [model.fit_transform(Wage) for model in models]
Xs *[sm.OLS(y, X_).fit() for X_ in Xs]) anova_lm(
df_resid | ssr | df_diff | ss_diff | F | Pr(>F) | |
---|---|---|---|---|---|---|
0 | 2998.0 | 5.022216e+06 | 0.0 | NaN | NaN | NaN |
1 | 2997.0 | 4.793430e+06 | 1.0 | 228786.010128 | 143.593107 | 2.363850e-32 |
2 | 2996.0 | 4.777674e+06 | 1.0 | 15755.693664 | 9.888756 | 1.679202e-03 |
3 | 2995.0 | 4.771604e+06 | 1.0 | 6070.152124 | 3.809813 | 5.104620e-02 |
4 | 2994.0 | 4.770322e+06 | 1.0 | 1282.563017 | 0.804976 | 3.696820e-01 |
= [MS(['education', poly('age', degree=d)]) for d in range(1, 4)]
models = [model.fit_transform(Wage) for model in models]
XEs *[sm.OLS(y, X_).fit() for X_ in XEs]) anova_lm(
df_resid | ssr | df_diff | ss_diff | F | Pr(>F) | |
---|---|---|---|---|---|---|
0 | 2997.0 | 3.902335e+06 | 0.0 | NaN | NaN | NaN |
1 | 2996.0 | 3.759472e+06 | 1.0 | 142862.701185 | 113.991883 | 3.838075e-26 |
2 | 2995.0 | 3.753546e+06 | 1.0 | 5926.207070 | 4.728593 | 2.974318e-02 |
= poly_age.transform(Wage)
X = Wage['high_earn'] = y > 250 # shorthand
high_earn = sm.GLM(y > 250, X, family=sm.families.Binomial())
glm = glm.fit()
B
summarize(B)# logistic regression
coef | std err | z | P>|z| | |
---|---|---|---|---|
intercept | -4.3012 | 0.345 | -12.457 | 0.000 |
poly(age, degree=4)[0] | 71.9642 | 26.133 | 2.754 | 0.006 |
poly(age, degree=4)[1] | -85.7729 | 35.929 | -2.387 | 0.017 |
poly(age, degree=4)[2] | 34.1626 | 19.697 | 1.734 | 0.083 |
poly(age, degree=4)[3] | -47.4008 | 24.105 | -1.966 | 0.049 |
= poly_age.transform(age_df)
newX = B.get_prediction(newX)
preds = preds.conf_int(alpha=0.05)
bands
= subplots(figsize=(8,8))
fig , ax = np.random.default_rng(0)
rng + 0.2 * rng.uniform(size=y.shape[0]), np.where(high_earn , 0.198, 0.002),
ax.scatter(age ='gray', marker='|')
fcfor val , ls in zip([preds.predicted_mean, bands[:,0], bands[:,1]], ['b','r--','r--']):
=3)
ax.plot(age_df.values , val , ls, linewidth'Degree -4 Polynomial', fontsize=20)
ax.set_title('Age', fontsize=20)
ax.set_xlabel(0,0.2])
ax.set_ylim (['P(Wage > 250)', fontsize=20); ax.set_ylabel(
= pd.qcut(age , 4)
cut_age summarize(sm.OLS(y, pd.get_dummies(cut_age)).fit())
coef | std err | t | P>|t| | |
---|---|---|---|---|
(17.999, 33.75] | 94.1584 | 1.478 | 63.692 | 0.0 |
(33.75, 42.0] | 116.6608 | 1.470 | 79.385 | 0.0 |
(42.0, 51.0] | 119.1887 | 1.416 | 84.147 | 0.0 |
(51.0, 80.0] | 116.5717 | 1.559 | 74.751 | 0.0 |
-
age 변수에 대한 B-spline basis function을 생성하고 OLS를 적합해 봄
- 안쪽 낫의 개수, 그리고 다항 차수에 따라 전체 basis 개수가 결정됨
= BSpline(internal_knots=[25,40,60], intercept=True).fit(age)
bs_ = bs_.transform(age)
bs_age
bs_age.shape
= MS([bs('age', internal_knots=[25,40,60])])
bs_age = bs_age.fit_transform(Wage)
Xbs = sm.OLS(y, Xbs).fit()
M
summarize(M)
= MS([bs('age', internal_knots=[25,40,60], name='bs(age)')])
bs_age = bs_age.fit_transform(Wage)
Xbs = sm.OLS(y, Xbs).fit()
M summarize(M)
coef | std err | t | P>|t| | |
---|---|---|---|---|
intercept | 60.4937 | 9.460 | 6.394 | 0.000 |
bs(age)[0] | 3.9805 | 12.538 | 0.317 | 0.751 |
bs(age)[1] | 44.6310 | 9.626 | 4.636 | 0.000 |
bs(age)[2] | 62.8388 | 10.755 | 5.843 | 0.000 |
bs(age)[3] | 55.9908 | 10.706 | 5.230 | 0.000 |
bs(age)[4] | 50.6881 | 14.402 | 3.520 | 0.000 |
bs(age)[5] | 16.6061 | 19.126 | 0.868 | 0.385 |
-
자유도 6은 basis function들 6개를 의미
=6).fit(age).internal_knots_ BSpline(df
array([33.75, 42. , 51. ])
-
자유도를 3으로하여 분석
= MS([bs('age', df=3, degree=0)]).fit(Wage)
bs_age0 = bs_age0.transform(Wage)
Xbs0 summarize(sm.OLS(y, Xbs0).fit())
coef | std err | t | P>|t| | |
---|---|---|---|---|
intercept | 94.1584 | 1.478 | 63.687 | 0.0 |
bs(age, df=3, degree=0)[0] | 22.3490 | 2.152 | 10.388 | 0.0 |
bs(age, df=3, degree=0)[1] | 24.8076 | 2.044 | 12.137 | 0.0 |
bs(age, df=3, degree=0)[2] | 22.7814 | 2.087 | 10.917 | 0.0 |
-
Natural Cubic Spline : 3차 다항에 선형을 결합
= MS([ns('age', df=5)]).fit(Wage)
ns_age = sm.OLS(y, ns_age.transform(Wage)).fit()
M_ns
summarize(M_ns)'Natural spline, df=5'); plot_wage_fit(age_df, ns_age,
-
Smoothing spline
= np.asarray(age).reshape((-1,1))
X_age = LinearGAM(s_gam(0, lam=0.6))
gam gam.fit(X_age , y)
LinearGAM(callbacks=[Deviance(), Diffs()], fit_intercept=True,
max_iter=100, scale=None, terms=s(0) + intercept, tol=0.0001,
verbose=False)
= subplots(figsize=(8,8))
fig , ax ='gray', alpha=0.5)
ax.scatter(age , y, facecolorfor lam in np.logspace(-2, 6, 5):
= LinearGAM(s_gam(0, lam=lam)).fit(X_age , y)
gam ='{:.1e}'.format(lam), linewidth=3)
ax.plot(age_grid, gam.predict(age_grid), label'Age', fontsize=20)
ax.set_xlabel('Wage', fontsize=20);
ax.set_ylabel(='$\lambda$'); ax.legend(title
= gam.gridsearch(X_age , y)
gam_opt ='Grid search', linewidth=10)
ax.plot(age_grid, gam_opt.predict(age_grid), label
ax.legend() fig
100% (11 of 11) |########################| Elapsed Time: 0:00:00 Time: 0:00:000:00