1. Imports

import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.rcParams['figure.figsize'] = (4.5, 3.0)
2. Logistic regression: sig(linr(x))
A. Regression model vs. logistic model
- Comparing the two models
  - Regression model: \(y_i \sim {\cal N}(w_0+w_1x_i, \sigma^2)\)
  - Logistic model: \(y_i \sim {\cal B}\big(\frac{\exp(w_0+w_1x_i)}{1+\exp(w_0+w_1x_i)}\big)\), where \({\cal B}(\pi)\) denotes a Bernoulli distribution with success probability \(\pi\)
- What we want to predict
  - Regression model: we want to predict the mean of the normal distribution, i.e. \(w_0+w_1x_i\). As the prediction we use \(\hat{w}_0 + \hat{w}_1x_i\).
  - Logistic model: we want to predict the mean of the Bernoulli distribution, i.e. \(\frac{\exp(w_0+w_1x_i)}{1+\exp(w_0+w_1x_i)}\). As the prediction we use \(\frac{\exp(\hat{w}_0+\hat{w}_1x_i)}{1+\exp(\hat{w}_0+\hat{w}_1x_i)}\).
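- Side note: since the logistic mean is exactly the sigmoid of the linear predictor, it can be computed directly with `torch.sigmoid`. A minimal sketch with a made-up input value (the input `xi` below is hypothetical, not from the data used later):

```python
import torch

w0, w1 = -1.0, 5.0                 # the true coefficients used in these notes
xi = torch.tensor(0.3)             # an arbitrary (hypothetical) input
mean = torch.sigmoid(w0 + w1*xi)   # = exp(w0+w1*xi)/(1+exp(w0+w1*xi))
print(mean)                        # P(y_i = 1) at x_i = 0.3
```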
B. Data: specs and employment
torch.manual_seed(43052)
x = torch.linspace(-1,1,2000).reshape(2000,1)
w0,w1 = -1, 5
prob = torch.exp(w0+w1*x) / (1+torch.exp(w0+w1*x))
y = torch.bernoulli(prob)
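- A quick sanity check (an optional sketch, not part of the original code): `y` should contain only 0/1 draws, and its overall frequency should be close to the average of `prob`.

```python
print(y.unique())             # tensor([0., 1.])
print(y.mean(), prob.mean())  # the two means should be roughly equal
```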
plt.plot(x,y,'.',alpha=0.03)
plt.plot(x[0],y[0],'.',label=r"$(x_i,y_i)$",color="C0")
plt.plot(x,prob,'--r',label=r"prob (true, unknown) = $\frac{exp(-1+5x)}{1+exp(-1+5x)}$")
plt.legend()
C. Step 1: designing the net (modeling)
- The first curve
  - Pick arbitrary initial values \(\hat{w}_0, \hat{w}_1\).
  - Initial values: \(\hat{w}_0=-0.8\), \(\hat{w}_1=-0.3\)
  - True values: \(w_0=-1\), \(w_1=5\)
- Method 1: `l1` + a hand-written `sigmoid`
l1 = torch.nn.Linear(1,1)
l1(x)
tensor([[ 0.6311],
[ 0.6304],
[ 0.6297],
...,
[-0.6902],
[-0.6909],
[-0.6916]], grad_fn=<AddmmBackward0>)
l1.weight.data = torch.tensor([[-0.3]])
l1.bias.data = torch.tensor([-0.8])
def sigmoid(x):
    return torch.exp(x)/(1+torch.exp(x))
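- Optional check (a sketch, not in the original code): the hand-written `sigmoid` agrees with the built-in `torch.sigmoid`.

```python
print(torch.allclose(sigmoid(l1(x)), torch.sigmoid(l1(x))))   # True
```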
plt.plot(x,y,'.',alpha=0.03)
plt.plot(x[0],y[0],'o',label=r"$(x_i,y_i)$",color="C0")
plt.plot(x,prob,'--r',label=r"prob (true, unknown) = $\frac{exp(-1+5x)}{1+exp(-1+5x)}$")
plt.plot(x,sigmoid(l1(x)).data,'--b', label=r"prob (estimated) = $(x_i,\hat{y}_i)$ -- first curve")
plt.legend()
- Method 2: `l1` + `a1`

l1 = torch.nn.Linear(1,1)
l1.weight.data = torch.tensor([[-0.3]])
l1.bias.data = torch.tensor([-0.8])
a1 = torch.nn.Sigmoid()
- Same result as the hand-written function

sigmoid(l1(x)), a1(l1(x))
(tensor([[0.3775],
[0.3775],
[0.3774],
...,
[0.2499],
[0.2498],
[0.2497]], grad_fn=<DivBackward0>),
tensor([[0.3775],
[0.3775],
[0.3774],
...,
[0.2499],
[0.2498],
[0.2497]], grad_fn=<SigmoidBackward0>))
- Method 3: `l1`, `a1` \(\to\) `net`
- Current structure:

\[{\bf x} \overset{l_1}{\to} {\bf u} \overset{a_1}{\to} {\bf v} = \hat{\bf y}\]

- Bundle the composition of the functions \(l_1\) and \(a_1\) into a single map:

\[(a_1\circ l_1)({\bf x}) := net({\bf x})\]

- Build a \(net\) that performs this in one call.
l1 = torch.nn.Linear(1,1)
l1.weight.data = torch.tensor([[-0.3]])
l1.bias.data = torch.tensor([-0.8])
a1 = torch.nn.Sigmoid()
net = torch.nn.Sequential(l1,a1)
- All three give the same result

net(x), a1(l1(x)), sigmoid(l1(x))
(tensor([[0.3775],
[0.3775],
[0.3774],
...,
[0.2499],
[0.2498],
[0.2497]], grad_fn=<SigmoidBackward0>),
tensor([[0.3775],
[0.3775],
[0.3774],
...,
[0.2499],
[0.2498],
[0.2497]], grad_fn=<SigmoidBackward0>),
tensor([[0.3775],
[0.3775],
[0.3774],
...,
[0.2499],
[0.2498],
[0.2497]], grad_fn=<DivBackward0>))
- Inspecting the structure of `net`

net[0], net[1]
(Linear(in_features=1, out_features=1, bias=True), Sigmoid())
net[0] is l1
True
net[1] is a1
True
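- The `is` checks show that `torch.nn.Sequential` stores references to the modules it is given, not copies, so their parameters are shared. A small check (sketch):

```python
print(net[0].weight is l1.weight)   # True: the very same Parameter object
```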
- Method 4: build `net` directly

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].weight.data = torch.tensor([[-0.3]])
net[0].bias.data = torch.tensor([-0.8])
yhat = net(x)
net(x)
tensor([[0.3775],
[0.3775],
[0.3774],
...,
[0.2499],
[0.2498],
[0.2497]], grad_fn=<SigmoidBackward0>)
D. Step 1~4
net = torch.nn.Sequential(
    torch.nn.Linear(in_features=1, out_features=1),
    torch.nn.Sigmoid()
)
l1, a1 = net
l1.weight.data = torch.tensor([[-0.3]])
l1.bias.data = torch.tensor([-0.8])
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
for epoc in range(100):
    ## 1: forward pass (compute yhat)
    yhat = net(x)
    ## 2: compute the loss (MSE for now)
    loss = torch.mean((y-yhat)**2)
    ## 3: backpropagate to obtain gradients
    loss.backward()
    ## 4: update the parameters, then reset the gradients
    optimizr.step()
    optimizr.zero_grad()
plt.plot(x,y,'.',alpha=0.05)
plt.plot(x,prob,'--r')
plt.plot(x,yhat.data,'--b')
plt.title('after 100 epochs')
for epoc in range(4900):
    ## 1
    yhat = net(x)
    ## 2
    loss = torch.mean((y-yhat)**2)
    ## 3
    loss.backward()
    ## 4
    optimizr.step()
    optimizr.zero_grad()
plt.plot(x,y,'.',alpha=0.05)
plt.plot(x,prob,'--r')
plt.plot(x,yhat.data,'--b')
plt.title('after 5000 epochs')
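- To see how close the fit is after 5000 epochs, one can also compare the learned coefficients with the true values \((w_0, w_1)=(-1, 5)\) (a sketch, not in the original code):

```python
print(net[0].bias.data, net[0].weight.data)   # should be roughly -1 and 5
```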
3. Visualizing the training process and recognizing the problem
A. Preparations for visualization
def plot_loss(loss_fn, ax=None, Wstar=[-1,5]):
    w0hat,w1hat = torch.meshgrid(torch.arange(-10,3,0.1),torch.arange(-1,10,0.1),indexing='ij')
    w0hat = w0hat.reshape(-1)
    w1hat = w1hat.reshape(-1)
    def l(w0hat,w1hat):
        yhat = torch.exp(w0hat+w1hat*x)/(1+torch.exp(w0hat+w1hat*x))
        return loss_fn(yhat,y)
    loss = list(map(l,w0hat,w1hat))
    #---#
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(1,1,1,projection='3d')
    ax.scatter(w0hat,w1hat,loss,s=0.001)
    ax.scatter(w0hat[::20],w1hat[::20],loss[::20],s=0.1,color='C0')
    w0star,w1star = np.array(Wstar).reshape(-1)
    ax.scatter(w0star,w1star,l(w0star,w1star),s=200,marker='*',color='red',label=f"W=[{w0star:.1f},{w1star:.1f}]")
    #---#
    ax.elev = 15
    #ax.dist = -20
    ax.azim = 75
    ax.legend()
    ax.set_xlabel(r'$w_0$')     # x-axis label
    ax.set_ylabel(r'$w_1$')     # y-axis label
    ax.set_xticks([-10,-5,0])   # x-axis ticks
    ax.set_yticks([-10,0,10])   # y-axis ticks
def _learn_and_record(net, loss_fn, optimizr):
    yhat_history = []
    loss_history = []
    What_history = []
    Whatgrad_history = []
    What_history.append([net[0].bias.data.item(), net[0].weight.data.item()])
    for epoc in range(100):
        ## step1
        yhat = net(x)
        ## step2
        loss = loss_fn(yhat,y)
        ## step3
        loss.backward()
        ## step4
        optimizr.step()
        ## record
        if epoc % 5 == 0:
            yhat_history.append(yhat.reshape(-1).data.tolist())
            loss_history.append(loss.item())
            What_history.append([net[0].bias.data.item(), net[0].weight.data.item()])
            Whatgrad_history.append([net[0].bias.grad.item(), net[0].weight.grad.item()])
        optimizr.zero_grad()
    return yhat_history, loss_history, What_history, Whatgrad_history
def show_animation(net, loss_fn, optimizr):
    yhat_history,loss_history,What_history,Whatgrad_history = _learn_and_record(net,loss_fn,optimizr)
    fig = plt.figure(figsize=(10,5))
    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')
    ## ax1: left panel
    ax1.scatter(x,y,alpha=0.01)
    ax1.scatter(x[0],y[0],color='C0',label=r"observed data = $(x_i,y_i)$")
    ax1.plot(x,prob,'--',label=r"prob (true) = $(x_i,\frac{exp(-1+5x_i)}{1+exp(-1+5x_i)})$")
    line, = ax1.plot(x,yhat_history[0],'--',label=r"prob (estimated) = $(x_i,\hat{y}_i)$")
    ax1.legend()
    ## ax2: right panel
    plot_loss(loss_fn,ax2)
    ax2.scatter(np.array(What_history)[0,0],np.array(What_history)[0,1],loss_history[0],color='blue',s=200,marker='*')
    def animate(epoc):
        line.set_ydata(yhat_history[epoc])
        w0hat = np.array(What_history)[epoc,0]
        w1hat = np.array(What_history)[epoc,1]
        w0hatgrad = np.array(Whatgrad_history)[epoc,0]
        w1hatgrad = np.array(Whatgrad_history)[epoc,1]
        ax2.scatter(w0hat,w1hat,loss_history[epoc],color='grey')
        ax2.set_title(f"What.grad=[{w0hatgrad:.4f},{w1hatgrad:.4f}]",y=0.8)
        fig.suptitle(f"epoch={epoc*5} // What=[{w0hat:.2f},{w1hat:.2f}] // Loss={loss_fn.__class__.__name__} // Opt={optimizr.__class__.__name__}")
        return line
    ani = animation.FuncAnimation(fig, animate, frames=20)
    plt.close()
    return ani
from matplotlib import animation
plt.rcParams["animation.html"] = "jshtml"
loss_fn = torch.nn.MSELoss()
plot_loss(loss_fn)
torch.manual_seed(42)
net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
show_animation(net,loss_fn,optimizr)
B. A good initial value
net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-0.8])
net[0].weight.data = torch.tensor([[-0.3]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
C. A plausible initial value
net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-3.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
D. The worst initial value
net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-10.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
4. Improving the loss function
A. Training with BCE loss
- BCE loss
  - \(loss = -\frac{1}{n}\sum_{i=1}^{n} \big(y_i\log(\hat{y}_i)+(1-y_i)\log(1-\hat{y}_i)\big)\) (the mean form, which is what `torch.nn.BCELoss` and the code below compute)
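- A quick check with made-up numbers (a sketch, not part of the original code) that the mean-form expression above matches `torch.nn.BCELoss`:

```python
yhat_demo = torch.tensor([[0.2],[0.7],[0.9]])   # hypothetical predicted probabilities
y_demo    = torch.tensor([[0.0],[1.0],[1.0]])   # hypothetical labels
manual  = -torch.mean(y_demo*torch.log(yhat_demo) + (1-y_demo)*torch.log(1-yhat_demo))
builtin = torch.nn.BCELoss()(yhat_demo, y_demo)  # prediction first, then target
print(torch.isclose(manual, builtin))            # tensor(True)
```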
net = torch.nn.Sequential(
    torch.nn.Linear(in_features=1, out_features=1),
    torch.nn.Sigmoid()
)
l1, a1 = net
l1.weight.data = torch.tensor([[-0.3]])
l1.bias.data = torch.tensor([-0.8])
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
for epoc in range(100):
    ## 1
    yhat = net(x)
    ## 2
    #loss = torch.mean((y-yhat)**2) # loss_fn(yhat,y)
    loss = -torch.mean(y*torch.log(yhat) + (1-y)*torch.log(1-yhat))
    ## 3
    loss.backward()
    ## 4
    optimizr.step()
    optimizr.zero_grad()
plt.plot(x,y,'.',alpha=0.05)
plt.plot(x,prob,'--r')
plt.plot(x,yhat.data,'--b')
plt.title('after 100 epochs')
- Using the built-in BCE loss
net = torch.nn.Sequential(
    torch.nn.Linear(in_features=1, out_features=1),
    torch.nn.Sigmoid()
)
l1, a1 = net
l1.weight.data = torch.tensor([[-0.3]])
l1.bias.data = torch.tensor([-0.8])
loss_fn = torch.nn.BCELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
for epoc in range(100):
    ## 1
    yhat = net(x)
    ## 2
    loss = loss_fn(yhat,y) # the prediction (yhat) must come first, then the target
    ## 3
    loss.backward()
    ## 4
    optimizr.step()
    optimizr.zero_grad()
plt.plot(x,y,'.',alpha=0.05)
plt.plot(x,prob,'--r')
plt.plot(x,yhat.data,'--b')
plt.title('after 100 epochs')
B. Visualizing the loss functions
- MSE Loss

plot_loss(torch.nn.MSELoss())
- BCE Loss

plot_loss(torch.nn.BCELoss())
fig = plt.figure()
ax1 = fig.add_subplot(1,2,1,projection='3d')
ax2 = fig.add_subplot(1,2,2,projection='3d')
plot_loss(torch.nn.MSELoss(),ax1)
plot_loss(torch.nn.BCELoss(),ax2)
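- To make the difference between the two surfaces concrete, the sketch below (not in the original notes, variable names are ad hoc) compares the gradient at the worst initial value \((w_0,w_1)=(-10,-1)\) under the two losses:

```python
w_tmp = torch.tensor([-10.0, -1.0], requires_grad=True)

p_tmp = torch.sigmoid(w_tmp[0] + w_tmp[1]*x)
mse_grad, = torch.autograd.grad(torch.nn.MSELoss()(p_tmp, y), w_tmp)

p_tmp = torch.sigmoid(w_tmp[0] + w_tmp[1]*x)
bce_grad, = torch.autograd.grad(torch.nn.BCELoss()(p_tmp, y), w_tmp)

print(mse_grad)   # tiny: the MSE surface is nearly flat here
print(bce_grad)   # much larger: BCE still gives a usable slope
```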
C. Comparison at a good initial value
- MSE Loss

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-0.8])
net[0].weight.data = torch.tensor([[-0.3]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
show_animation(net,loss_fn,optimizr)
- BCE Loss

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-0.8])
net[0].weight.data = torch.tensor([[-0.3]])
loss_fn = torch.nn.BCELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
show_animation(net,loss_fn,optimizr)
D. A plausible initial value
- MSE Loss

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-3.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
- BCE Loss

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-3.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.BCELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
E. The worst initial value
- MSE Loss

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-10.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
- BCE Loss

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-10.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.BCELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
5. Improving the optimizer
A. A good initial value
- MSE Loss + SGD

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-0.8470])
net[0].weight.data = torch.tensor([[-0.3467]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
show_animation(net,loss_fn,optimizr)
- MSE Loss + Adam

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-0.8])
net[0].weight.data = torch.tensor([[-0.3]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.Adam(net.parameters(),lr=0.25)
#---#
show_animation(net,loss_fn,optimizr)
B. A plausible initial value
- MSE Loss + SGD

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-3.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
- MSE Loss + Adam

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-3.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.Adam(net.parameters(),lr=0.25)
#---#
#show_animation(net,loss_fn,optimizr)
C. The worst initial value
- MSE Loss + SGD

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-10.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.SGD(net.parameters(),lr=0.05)
#---#
#show_animation(net,loss_fn,optimizr)
- MSE Loss + Adam

net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
net[0].bias.data = torch.tensor([-10.0])
net[0].weight.data = torch.tensor([[-1.0]])
loss_fn = torch.nn.MSELoss()
optimizr = torch.optim.Adam(net.parameters(),lr=0.25)
#---#
show_animation(net,loss_fn,optimizr)
6. Limitations of logistic regression
A. A news article
- Even applicants whose specs are too high may fail to get hired.
B. Synthetic data (the spec paradox)
df = pd.read_csv("https://raw.githubusercontent.com/guebin/DL2025/main/posts/ironyofspec.csv")
df
|      | x         | prob     | y   |
|------|-----------|----------|-----|
| 0    | -1.000000 | 0.000045 | 0.0 |
| 1    | -0.998999 | 0.000046 | 0.0 |
| 2    | -0.997999 | 0.000047 | 0.0 |
| 3    | -0.996998 | 0.000047 | 0.0 |
| 4    | -0.995998 | 0.000048 | 0.0 |
| ...  | ...       | ...      | ... |
| 1995 | 0.995998  | 0.505002 | 0.0 |
| 1996 | 0.996998  | 0.503752 | 0.0 |
| 1997 | 0.997999  | 0.502501 | 0.0 |
| 1998 | 0.998999  | 0.501251 | 1.0 |
| 1999 | 1.000000  | 0.500000 | 1.0 |

2000 rows × 3 columns
x = torch.tensor(df.x).float().reshape(-1,1)
y = torch.tensor(df.y).float().reshape(-1,1)
prob = torch.tensor(df.prob).float().reshape(-1,1)
plt.plot(x,y,'o',alpha=0.02)
plt.plot(x[0],y[0],'o',label= r"observed data = $(x_i,y_i)$",color="C0")
plt.plot(x,prob,'--b',label= r"prob (true, unknown)")
plt.legend()
C. Fitting a logistic model
torch.manual_seed(43052)
net = torch.nn.Sequential(
    torch.nn.Linear(1,1),
    torch.nn.Sigmoid()
)
loss_fn = torch.nn.BCELoss()
optimizr = torch.optim.Adam(net.parameters())
#---#
for epoc in range(5000):
    ## 1
    yhat = net(x)
    ## 2
    loss = loss_fn(yhat,y)
    ## 3
    loss.backward()
    ## 4
    optimizr.step()
    optimizr.zero_grad()
plt.plot(x,y,'o',alpha=0.02)
plt.plot(x[0],y[0],'o',label= r"observed data = $(x_i,y_i)$",color="C0")
plt.plot(x,prob,'--b',label= r"prob (true, unknown)")
plt.plot(x,net(x).data,'--', label= r"prob (estimated) = $(x_i,\hat{y}_i)$")
plt.legend()
D. An idea for overcoming the limitation
- The quantity fed into the sigmoid should not be a single straight line but a bent (piecewise-linear) line, as illustrated by the figure produced below.
a = torch.nn.Sigmoid()
fig,ax = plt.subplots(4,2,figsize=(8,8))
u1 = torch.tensor([-6,-4,-2,0,2,4,6])
u2 = torch.tensor([6,4,2,0,-2,-4,-6])
u3 = torch.tensor([-6,-2,2,6,2,-2,-6])
u4 = torch.tensor([-6,-2,2,6,4,2,0])
ax[0,0].plot(u1,'--o',color='C0',label = r"$u_1$")
ax[0,0].legend()
ax[0,1].plot(a(u1),'--o',color='C0',label = r"$a(u_1)=\frac{exp(u_1)}{exp(u_1)+1}$")
ax[0,1].legend()
ax[1,0].plot(u2,'--o',color='C1',label = r"$u_2$")
ax[1,0].legend()
ax[1,1].plot(a(u2),'--o',color='C1',label = r"$a(u_2)=\frac{exp(u_2)}{exp(u_2)+1}$")
ax[1,1].legend()
ax[2,0].plot(u3,'--o',color='C2', label = r"$u_3$")
ax[2,0].legend()
ax[2,1].plot(a(u3),'--o',color='C2', label = r"$a(u_3)=\frac{exp(u_3)}{exp(u_3)+1}$")
ax[2,1].legend()
ax[3,0].plot(u4,'--o',color='C3', label = r"$u_4$")
ax[3,0].legend()
ax[3,1].plot(a(u4),'--o',color='C3', label = r"$a(u_4)=\frac{exp(u_4)}{exp(u_4)+1}$")
ax[3,1].legend()
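- One way to obtain such a bent pre-sigmoid curve is to place a small hidden layer before the final sigmoid. The block below is only a sketch of the idea; the layer sizes and the ReLU choice are illustrative assumptions, not the architecture developed in these notes.

```python
net2 = torch.nn.Sequential(
    torch.nn.Linear(1, 2),   # two linear pieces
    torch.nn.ReLU(),         # bends them
    torch.nn.Linear(2, 1),   # recombines them into one bent line (pre-sigmoid)
    torch.nn.Sigmoid()       # maps the bent line to a probability
)
```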