1. Imports
import gymnasium as gym
#---#
import numpy as np
import collections
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython
2. Designing and Solving a Bandit Environment
A. Quick concept-only walkthrough
action_space = [0,1]
actions_deque = collections.deque(maxlen=500)
rewards_deque = collections.deque(maxlen=500)
#---#
for _ in range(10):
    action = np.random.choice(action_space)
    if action == 1:
        reward = 10
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
actions_deque
deque([0, 0, 1, 1, 1, 1, 1, 1, 1, 0], maxlen=500)
rewards_deque
deque([1, 1, 10, 10, 10, 10, 10, 10, 10, 1], maxlen=500)
actions_numpy = np.array(actions_deque)
rewards_numpy = np.array(rewards_deque)
q0 = rewards_numpy[actions_numpy == 0].mean()
q1 = rewards_numpy[actions_numpy == 1].mean()
q_table = np.array([q0,q1])
q_table
array([ 1., 10.])
action = q_table.argmax()
for _ in range(5):
    action = q_table.argmax()
    if action == 1:
        reward = 10
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
actions_numpy = np.array(actions_deque)
rewards_numpy = np.array(rewards_deque)
q0 = rewards_numpy[actions_numpy == 0].mean()
q1 = rewards_numpy[actions_numpy == 1].mean()
q_table = np.array([q0,q1])
actions_numpy
array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])
rewards_numpy
array([ 1, 1, 10, 10, 10, 10, 10, 10, 10, 1, 10, 10, 10, 10, 10])
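- Side note: Part A uses purely random actions for the first 10 steps and purely greedy actions afterwards. A common middle ground is epsilon-greedy: act greedily most of the time but keep exploring with a small probability. Below is a minimal sketch in the same toy setup (the eps value of 0.1 is an arbitrary choice for illustration, not from the original):
eps = 0.1  # exploration probability (arbitrary illustrative value)
for _ in range(5):
    if np.random.rand() < eps:
        action = np.random.choice(action_space)   # explore: random button
    else:
        action = q_table.argmax()                 # exploit: best button so far
    reward = 10 if action == 1 else 1
    actions_deque.append(action)
    rewards_deque.append(reward)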
B. Implementation with classes
class Bandit:
    def __init__(self):
        self.reward = None
    def step(self,action):
        if action == 0:
            self.reward = 1
        else:
            self.reward = 10
        return self.reward
class Agent:
    def __init__(self):
        pass
    def act(self):
        # if fewer than 20 experiences --> random action
        # if 20 or more experiences --> action = q_table.argmax()
        pass
    def save_experience(self):
        # store the data
        pass
    def learn(self):
        # the process of updating q_table
        pass
class Agent:
    def __init__(self):
        self.n_experiences = 0
        self.action_space = [0,1]
        self.action = None
        self.reward = None
        self.q_table = None
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
    def act(self):
        if self.n_experiences < 20:
            self.action = np.random.choice(self.action_space)
        else:
            self.action = self.q_table.argmax()
        print(f"Pressed button {self.action}!")
    def save_experience(self):
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experiences = self.n_experiences + 1
    def learn(self):
        if self.n_experiences < 20:
            pass
        else:
            actions = np.array(self.actions)
            rewards = np.array(self.rewards)
            q0 = rewards[actions==0].mean()
            q1 = rewards[actions==1].mean()
            self.q_table = np.array([q0,q1])
env = Bandit()
player = Agent()
for _ in range(50):
    # step1: agent --> env
    player.act()
    # step2: agent <-- env
    player.reward = env.step(player.action)
    # step3: agent: update (save + learn)
    player.save_experience()
    player.learn()
    #----#
    if player.n_experiences < 20:
        pass
    else:
        recent_rewards = np.array(player.rewards)[-20:]
        if recent_rewards.mean() > 9.5:
            print("---")
            print(f"Game cleared in {player.n_experiences} tries")
            break
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
---
Game cleared in 36 tries
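- Side note: learn() above recomputes the mean over the entire deque at every step. The same estimate can be kept incrementally with a running count per action, using the update Q(a) <- Q(a) + (r - Q(a)) / n(a). A minimal self-contained sketch (the names q_est and counts are illustrative, not part of the Agent class above):
q_est = np.zeros(2)     # running mean reward per button
counts = np.zeros(2)    # how many times each button was pressed
for _ in range(20):
    action = np.random.choice([0, 1])
    reward = 10 if action == 1 else 1
    counts[action] += 1
    q_est[action] += (reward - q_est[action]) / counts[action]  # incremental mean
q_est  # approaches array([1., 10.]) once both buttons have been tried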
3. Preliminaries: gym.spaces
- Example 1
action_space = gym.spaces.Discrete(4)
action_space
Discrete(4)
[action_space.sample() for _ in range(5)]
[3, 0, 3, 0, 0]
0 in action_space
True
4 in action_space
False
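- Side note: sample() draws a new random element on every call; gymnasium spaces can be seeded when reproducible draws are needed. A minimal sketch (the seed value 42 is arbitrary):
action_space.seed(42)                        # fix the space's internal RNG
[action_space.sample() for _ in range(5)]    # same 5 values on every rerun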
- Example 2
state_space = gym.spaces.MultiDiscrete([4,4])
state_space
MultiDiscrete([4 4])
[state_space.sample() for _ in range(5)]
[array([0, 1]), array([0, 3]), array([0, 3]), array([1, 1]), array([2, 3])]
np.array([0,1]) in state_space
True
np.array([3,3]) in state_space
True
np.array([3,4]) in state_space
False
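- Side note: later on, a MultiDiscrete state such as array([2, 3]) has to index a table. One option is an array with one axis per state component (plus one for the action); another is flattening the pair into a single index. A minimal sketch (the names and the (4,4,4) shape are illustrative assumptions):
q_table = np.zeros([4, 4, 4])               # (row, col, action), illustrative shape
s = state_space.sample()                     # e.g. array([2, 3])
q_table[s[0], s[1], 0] = 1.0                 # index the table directly with the state
flat = np.ravel_multi_index(s, (4, 4))       # or map (row, col) to a single index 0..15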
4. 4x4 Grid World: Game Description
A. Game description
- Problem description: a game in which an agent that moves up, down, left, and right on a 4x4 grid world has to reach the goal cell.
- Seeing is believing: https://claude.ai/public/artifacts/76e13820-2b51-4e7e-a514-00190de17c45 (source: Claude)
- Key variables used in GridWorld (one full transition is sketched below this list):
  - State: each grid cell is one state, and the agent occupies exactly one of these states at any time.
  - Action: to move from the current state to the next, the agent takes one of four actions: up, down, left, or right.
  - Reward: the reward obtained when the agent takes a particular action in the current state.
  - Terminated: a flag indicating that the episode has ended.
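- As illustrated here, these variables together describe one step of experience. A minimal sketch of recording a single transition (the Transition name and the specific numbers are illustrative; the reward value and action encoding follow the environment defined in Section 5):
Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "terminated"])
# one hypothetical step: move right from (0,0) to (0,1) and pay the usual step cost of -1
t = Transition(state=(0, 0), action=0, reward=-1, next_state=(0, 1), terminated=False)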
B. Visualization
def show(states):
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr', alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    state_space = gym.spaces.MultiDiscrete([4,4])
    def update(t):
        if states[t] in state_space:
            s1,s2 = states[t]
            states[t] = [s2,s1]   # swap (row, col) -> (x, y) for scatter
            sc.set_offsets(states[t])
        else:
            s1,s2 = states[t]
            s1 = s1 + 0.5 if s1 < 0 else (s1 - 0.5 if s1 > 3 else s1)
            s2 = s2 + 0.5 if s2 < 0 else (s2 - 0.5 if s2 > 3 else s2)
            states[t] = [s2,s1]   # out-of-grid states are drawn half a cell outside
            sc.set_offsets(states[t])
    ani = FuncAnimation(fig,update,frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))
show([[0,0],[1,0],[2,0],[3,0],[4,0]])  # how to use show
5. Implementing the 4x4 Grid World Environment
class GridWorld:
    def __init__(self):
        self.a2d = {
            0: np.array([0,1]),   # →
            1: np.array([0,-1]),  # ←
            2: np.array([1,0]),   # ↓
            3: np.array([-1,0])   # ↑
        }
        self.state = np.array([0,0])
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.terminated = False
        self.reward = None
    def reset(self):
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
        return self.state
    def step(self,action):
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            self.reward = 100
            self.terminated = True
        elif self.state in self.state_space:
            self.reward = -1
            self.terminated = False
        else:
            self.reward = -10
            self.terminated = True
        print(
            f"action = {action}\t"
            f"state = {self.state - self.a2d[action]} -> {self.state}\t"
            f"reward = {self.reward}\t"
            f"terminated = {self.terminated}"
        )
        return self.state, self.reward, self.terminated
env = GridWorld()
for t in range(10):
    action = action_space.sample()
    env.step(action)
    if env.terminated:
        env.state = env.reset()
action = 0 state = [3 2] -> [3 3] reward = 100 terminated = True
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
action = 2 state = [0 0] -> [1 0] reward = -1 terminated = False
action = 0 state = [1 0] -> [1 1] reward = -1 terminated = False
action = 0 state = [1 1] -> [1 2] reward = -1 terminated = False
action = 0 state = [1 2] -> [1 3] reward = -1 terminated = False
action = 3 state = [1 3] -> [0 3] reward = -1 terminated = False
action = 3 state = [0 3] -> [-1 3] reward = -10 terminated = True
action = 1 state = [0 0] -> [ 0 -1] reward = -10 terminated = True
action = 2 state = [0 0] -> [1 0] reward = -1 terminated = False
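- Quick sanity check (a sketch, not part of the original run): from (0,0), moving right three times and then down three times should produce five -1 rewards followed by the +100 at the goal, i.e. a return of 95.
env.reset()
total = 0
for a in [0, 0, 0, 2, 2, 2]:    # right x3, then down x3
    _, r, terminated = env.step(a)
    total = total + r
total    # expected: 5*(-1) + 100 = 95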
6. Implementing the “Agent \(\Leftrightarrow\) Environment” Interaction
- Features we want to implement:
  - .act(): decides the action --> here it is simply a random action.
  - .save_experience(): stores the data --> this is what we focus on for now.
  - .learn(): learns from the data --> skipped (pass) for now.
class RandomAgent:
    def __init__(self):
        self.n_experiences = 0
        self.action_space = gym.spaces.Discrete(4)
        #---#
        self.state = None
        self.action = None
        self.reward = None
        self.next_state = None
        self.terminated = None
        #---#
        self.states = collections.deque(maxlen=500)
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.next_states = collections.deque(maxlen=500)
        self.terminations = collections.deque(maxlen=500)
        #---#
        #self.q_table = None
    def act(self):
        self.action = self.action_space.sample()
    def save_experience(self):
        self.states.append(self.state)
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.next_states.append(self.next_state)
        self.terminations.append(self.terminated)
        self.n_experiences = self.n_experiences + 1
    def learn(self):
        pass
player = RandomAgent()
env = GridWorld()
for _ in range(50):
    # step1: agent --> env
    player.act()
    # step2: agent <-- env
    player.next_state, player.reward, player.terminated = env.step(player.action)
    # step3: agent: update (save + learn)
    player.save_experience()
    player.learn()
    # step4: prepare next iteration
    player.state = player.next_state
    if env.terminated:
        player.state = env.reset()
        break
action = 2 state = [0 0] -> [1 0] reward = -1 terminated = False
action = 2 state = [1 0] -> [2 0] reward = -1 terminated = False
action = 1 state = [2 0] -> [ 2 -1] reward = -10 terminated = True
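- Since save_experience() is the focus here, it is worth peeking at what actually got stored after those three steps; a minimal inspection sketch (the exact contents depend on the random actions above):
print("n_experiences:", player.n_experiences)
print("actions      :", list(player.actions))
print("rewards      :", list(player.rewards))
print("terminations :", list(player.terminations))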
scores = []
playtimes = []
for e in range(1,1000):
    player.state = env.reset()
    score = 0
    #---#
    for playtime in range(1,50):
        # step1: agent --> env
        player.act()
        # step2: agent <-- env
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3: agent: update (save + learn)
        player.save_experience()
        player.learn()
        # step4: prepare next iteration
        player.state = player.next_state
        score = score + player.reward
        if env.terminated:
            print(f"--- episode {e} finished ---")
            break
    #---#
    scores.append(score)
    if scores[-1] > 0:
        break
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
--- episode 1 finished ---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 terminated = True
--- episode 2 finished ---
action = 2 state = [0 0] -> [1 0] reward = -1 terminated = False
action = 3 state = [1 0] -> [0 0] reward = -1 terminated = False
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
--- episode 3 finished ---
action = 2 state = [0 0] -> [1 0] reward = -1 terminated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 terminated = True
--- episode 4 finished ---
action = 2 state = [0 0] -> [1 0] reward = -1 terminated = False
action = 2 state = [1 0] -> [2 0] reward = -1 terminated = False
action = 0 state = [2 0] -> [2 1] reward = -1 terminated = False
action = 1 state = [2 1] -> [2 0] reward = -1 terminated = False
action = 1 state = [2 0] -> [ 2 -1] reward = -10 terminated = True
--- episode 5 finished ---
action = 0 state = [0 0] -> [0 1] reward = -1 terminated = False
action = 1 state = [0 1] -> [0 0] reward = -1 terminated = False
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
--- episode 6 finished ---
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
--- episode 7 finished ---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 terminated = True
--- episode 8 finished ---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 terminated = True
--- episode 9 finished ---
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
--- episode 10 finished ---
action = 0 state = [0 0] -> [0 1] reward = -1 terminated = False
action = 3 state = [0 1] -> [-1 1] reward = -10 terminated = True
--- episode 11 finished ---
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
--- episode 12 finished ---
action = 0 state = [0 0] -> [0 1] reward = -1 terminated = False
action = 2 state = [0 1] -> [1 1] reward = -1 terminated = False
action = 1 state = [1 1] -> [1 0] reward = -1 terminated = False
action = 0 state = [1 0] -> [1 1] reward = -1 terminated = False
action = 3 state = [1 1] -> [0 1] reward = -1 terminated = False
action = 2 state = [0 1] -> [1 1] reward = -1 terminated = False
action = 1 state = [1 1] -> [1 0] reward = -1 terminated = False
action = 0 state = [1 0] -> [1 1] reward = -1 terminated = False
action = 0 state = [1 1] -> [1 2] reward = -1 terminated = False
action = 0 state = [1 2] -> [1 3] reward = -1 terminated = False
action = 2 state = [1 3] -> [2 3] reward = -1 terminated = False
action = 1 state = [2 3] -> [2 2] reward = -1 terminated = False
action = 0 state = [2 2] -> [2 3] reward = -1 terminated = False
action = 2 state = [2 3] -> [3 3] reward = 100 terminated = True
--- episode 13 finished ---
scores[-1]
87
paths = [np.array([0,0])] + list(player.next_states)[-20:]
show(paths)
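- Note: taking the last 20 next-states is a rough cut and may include the tail of earlier episodes. A sketch that slices out exactly the final, successful episode by using the stored termination flags (assuming, as above, that player.terminations holds one boolean per saved step):
terms = np.array(player.terminations)
ends = np.where(terms)[0]                         # indices where an episode terminated
start = ends[-2] + 1 if len(ends) >= 2 else 0     # first step of the last episode
last_path = [np.array([0, 0])] + list(player.next_states)[start:]
show(last_path)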