13-2. Reinforcement Learning (Designing and Solving a Bandit Environment; Explaining and Implementing a 4x4 Grid World)

Author

이상민

Published

June 2, 2025

1. Imports

import gymnasium as gym
#---#
import numpy as np
import collections
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython

2. Designing and Solving a Bandit Environment

A. A quick hands-on look at the concept

Setup: there are two buttons (actions 0 and 1). Button 0 always pays a reward of 1 and button 1 always pays 10; we act randomly at first and then estimate each action's value by the sample mean of its rewards.

action_space = [0,1] 
actions_deque = collections.deque(maxlen=500)
rewards_deque = collections.deque(maxlen=500)
#---#
for _ in range(10):  # 10 random pulls to gather experience
    action = np.random.choice(action_space)
    if action == 1:
        reward = 10 
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
actions_deque
deque([0, 0, 1, 1, 1, 1, 1, 1, 1, 0], maxlen=500)
rewards_deque
deque([1, 1, 10, 10, 10, 10, 10, 10, 10, 1], maxlen=500)
actions_numpy = np.array(actions_deque)
rewards_numpy = np.array(rewards_deque)
q0 = rewards_numpy[actions_numpy == 0].mean()  # sample-mean reward of action 0
q1 = rewards_numpy[actions_numpy == 1].mean()  # sample-mean reward of action 1
q_table = np.array([q0,q1])
q_table
array([ 1., 10.])
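A side note (a sketch, not part of the original cells): the same sample means can be maintained incrementally with the standard bandit update Q ← Q + (r − Q)/n, which avoids re-scanning the whole deque on every step. Assuming actions_numpy and rewards_numpy from the cell above:

q = np.zeros(2)  # running mean reward per action
n = np.zeros(2)  # number of times each action was taken
for a, r in zip(actions_numpy, rewards_numpy):
    n[a] += 1
    q[a] += (r - q[a]) / n[a]  # Q <- Q + (r - Q)/n
q  # matches q_table above
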
action = q_table.argmax()  # greedy action w.r.t. the current value estimates
for _ in range(5):
    action = q_table.argmax()
    if action == 1:
        reward = 10 
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
    actions_numpy = np.array(actions_deque)
    rewards_numpy = np.array(rewards_deque)    
    q0 = rewards_numpy[actions_numpy == 0].mean()
    q1 = rewards_numpy[actions_numpy == 1].mean()
    q_table = np.array([q0,q1])
actions_numpy
array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])
rewards_numpy
array([ 1,  1, 10, 10, 10, 10, 10, 10, 10,  1, 10, 10, 10, 10, 10])

B. Implementation using classes

class Bandit:
    def __init__(self):
        self.reward = None 
    def step(self,action):
        if action == 0:
            self.reward = 1
        else: 
            self.reward = 10 
        return self.reward 
class Agent:
    def __init__(self):
        pass 
    def act(self):
        # if fewer than 20 experiences --> take a random action
        # otherwise --> action = q_table.argmax()
        pass 
    def save_experience(self):
        # store the experience
        pass 
    def learn(self):
        # update the q_table
        pass

class Agent:
    def __init__(self):
        self.n_experiences = 0 
        self.action_space = [0,1]
        self.action = None
        self.reward = None 
        self.q_table = None 
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
    def act(self):
        if self.n_experiences < 20:
            self.action = np.random.choice(self.action_space)
        else: 
            self.action = self.q_table.argmax()
        print(f"버튼{self.action}누름!")
    def save_experience(self):
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experiences = self.n_experiences + 1 
    def learn(self):
        if self.n_experiences < 20:
            pass
        else: 
            actions = np.array(self.actions)
            rewards = np.array(self.rewards)
            q0 = rewards[actions==0].mean()
            q1 = rewards[actions==1].mean()
            self.q_table = np.array([q0,q1])
env = Bandit()
player = Agent()
for _ in range(50):
    # step1 : agent --> env 
    player.act()
    # step2: agent <-- env
    player.reward = env.step(player.action)
    # step3: agent: update (save + learn) 
    player.save_experience() 
    player.learn()
    #----#
    if player.n_experiences < 20: 
        pass 
    else: 
        recent_rewards = np.array(player.rewards)[-20:]
        if recent_rewards.mean() > 9.5:
            print("---")
            print(f"{player.n_experiences}번만에 게임클리어")
            break 
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
---
Game cleared in 36 steps!
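
For reference (a sketch, not part of the Agent above): instead of the fixed rule "random for the first 20 experiences, greedy afterwards", a common alternative is ε-greedy, which keeps exploring with a small probability forever. The helper name act_eps_greedy and the parameter eps below are hypothetical:

def act_eps_greedy(q_table, eps=0.1):
    # explore with probability eps (or when no estimates exist yet), otherwise exploit
    if (q_table is None) or (np.random.rand() < eps):
        return np.random.choice([0, 1])
    return int(q_table.argmax())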

3. Preliminaries: gym.spaces

- Example 1

action_space = gym.spaces.Discrete(4) 
action_space 
Discrete(4)
[action_space.sample() for _ in range(5)]
[3, 0, 3, 0, 0]
0 in action_space
True
4 in action_space
False

- Example 2

state_space = gym.spaces.MultiDiscrete([4,4])
state_space
MultiDiscrete([4 4])
[state_space.sample() for _ in range(5)]
[array([0, 1]), array([0, 3]), array([0, 3]), array([1, 1]), array([2, 3])]
np.array([0,1]) in state_space
True
np.array([3,3]) in state_space
True
np.array([3,4]) in state_space
False
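
These two spaces are exactly how the GridWorld in section 5 is wired: actions come from Discrete(4), and the `in` test on MultiDiscrete([4,4]) detects when a move leaves the grid. A small sketch reusing the action_space and state_space defined above:

# random walk that stops as soon as the state leaves the 4x4 grid
state = np.array([0, 0])
moves = [np.array([0,1]), np.array([0,-1]), np.array([1,0]), np.array([-1,0])]
while state in state_space:
    state = state + moves[action_space.sample()]
state  # the first out-of-bounds position reached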

4. 4x4 Grid World: Game Description

A. Game description

- Problem: a game in which an agent moving up, down, left, and right on a 4x4 grid world must reach the goal cell.

- Key variables used in GridWorld (a one-step transition sketch follows this list):

  1. State: each grid cell is one state, and the agent occupies exactly one of these states at any time.
  2. Action: to move from the current state to the next, the agent chooses one of up, down, left, or right.
  3. Reward: the payoff the agent receives for taking a particular action in the current state.
  4. Terminated: a flag indicating that the episode has ended.
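
A minimal one-step transition sketch, using the reward scheme implemented in section 5 (+100 on reaching [3,3], -10 for stepping off the grid, -1 otherwise):

state = np.array([3, 2])
action = 0                                  # "move right" in section 5's encoding
next_state = state + np.array([0, 1])       # [3 2] -> [3 3]
terminated = bool((next_state == 3).all())  # True: the goal cell is reached
reward = 100 if terminated else -1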

B. Visualization

def show(states):
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr', alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    state_space = gym.spaces.MultiDiscrete([4,4])
    def update(t):
        s1, s2 = states[t]
        if states[t] not in state_space:
            # clamp out-of-bounds coordinates to just outside the grid edge
            s1 = s1 + 0.5 if s1 < 0 else (s1 - 0.5 if s1 > 3 else s1)
            s2 = s2 + 0.5 if s2 < 0 else (s2 - 0.5 if s2 > 3 else s2)
        sc.set_offsets([s2, s1])  # (row, col) -> (x, y), so plotting swaps the axes
    ani = FuncAnimation(fig, update, frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))
show([[0,0],[1,0],[2,0],[3,0],[4,0]]) # how to use show; the last state [4,0] is out of bounds

5. 4x4 Grid World: Environment Implementation

class GridWorld:
    def __init__(self):
        self.a2d = {
            0: np.array([0,1]),  # →
            1: np.array([0,-1]), # ←
            2: np.array([1,0]),  # ↓
            3: np.array([-1,0])  # ↑
        }
        self.state = np.array([0,0])
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.terminated = False
        self.reward = None
    def reset(self):
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
        return self.state
    def step(self,action):
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            self.reward = 100
            self.terminated = True
        elif self.state in self.state_space:
            self.reward = -1
            self.terminated = False
        else:
            self.reward = -10
            self.terminated = True
        print(
            f"action = {action}\t"
            f"state = {self.state - self.a2d[action]} -> {self.state}\t"
            f"reward = {self.reward}\t"
            f"terminated = {self.terminated}"
        )
        return self.state, self.reward, self.terminated
env = GridWorld()
for t in range(10):
    action = action_space.sample() # reuses the Discrete(4) action_space from section 3
    env.step(action)
    if env.terminated:
        env.state = env.reset()
action = 0  state = [3 2] -> [3 3]  reward = 100    terminated = True
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
action = 2  state = [0 0] -> [1 0]  reward = -1 terminated = False
action = 0  state = [1 0] -> [1 1]  reward = -1 terminated = False
action = 0  state = [1 1] -> [1 2]  reward = -1 terminated = False
action = 0  state = [1 2] -> [1 3]  reward = -1 terminated = False
action = 3  state = [1 3] -> [0 3]  reward = -1 terminated = False
action = 3  state = [0 3] -> [-1  3]    reward = -10    terminated = True
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    terminated = True
action = 2  state = [0 0] -> [1 0]  reward = -1 terminated = False

6. Implementing the “Agent \(\Leftrightarrow\) Environment” Interaction

- The functionality we want to implement:

  • .act(): decide an action –> here, just a random action
  • .save_experience(): store the data –> let's focus on this first
  • .learn(): learn from the data –> skipped for now

class RandomAgent:
    def __init__(self):
        self.n_experiences = 0 
        self.action_space = gym.spaces.Discrete(4)
        #---#
        self.state = None 
        self.action = None
        self.reward = None 
        self.next_state = None 
        self.terminated = None
        #---#
        self.states = collections.deque(maxlen=500)
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.next_states = collections.deque(maxlen=500)
        self.terminations = collections.deque(maxlen=500)
        #---#
        #self.q_table = None 
    def act(self):
        self.action = self.action_space.sample()
    def save_experience(self):
        self.states.append(self.state)
        self.actions.append(self.action)        
        self.rewards.append(self.reward)
        self.next_states.append(self.next_state)
        self.terminations.append(self.terminated)
        self.n_experiences = self.n_experiences + 1 
    def learn(self):
        pass
player = RandomAgent()
env = GridWorld()
for _ in range(50):
    # step1 : agent --> env 
    player.act()
    # step2: agent <-- env
    player.next_state, player.reward, player.terminated = env.step(player.action)
    # step3: agent: update (save + learn) 
    player.save_experience() 
    player.learn()
    # step4: prepare next iterations 
    player.state = player.next_state
    if env.terminated:
        player.state = env.reset()
        break
action = 2  state = [0 0] -> [1 0]  reward = -1 terminated = False
action = 2  state = [1 0] -> [2 0]  reward = -1 terminated = False
action = 1  state = [2 0] -> [ 2 -1]    reward = -10    terminated = True
scores = [] 
playtimes = []
for e in range(1,1000):
    player.state = env.reset()
    score = 0 
    #---#
    for playtime in range(1,50):
        # step1 : agent --> env 
        player.act()
        # step2: agent <-- env
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3: agent: update (save + learn) 
        player.save_experience() 
        player.learn()
        # step4: prepare next iterations 
        player.state = player.next_state
        score = score + player.reward
        if env.terminated:
            print(f"---에피소드{e}종료---")
            break
    #---#
    scores.append(score)
    playtimes.append(playtime)
    if scores[-1] > 0:
        break
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
--- episode 1 finished ---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    terminated = True
--- episode 2 finished ---
action = 2  state = [0 0] -> [1 0]  reward = -1 terminated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 terminated = False
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
--- episode 3 finished ---
action = 2  state = [0 0] -> [1 0]  reward = -1 terminated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    terminated = True
--- episode 4 finished ---
action = 2  state = [0 0] -> [1 0]  reward = -1 terminated = False
action = 2  state = [1 0] -> [2 0]  reward = -1 terminated = False
action = 0  state = [2 0] -> [2 1]  reward = -1 terminated = False
action = 1  state = [2 1] -> [2 0]  reward = -1 terminated = False
action = 1  state = [2 0] -> [ 2 -1]    reward = -10    terminated = True
--- episode 5 finished ---
action = 0  state = [0 0] -> [0 1]  reward = -1 terminated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 terminated = False
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
--- episode 6 finished ---
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
--- episode 7 finished ---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    terminated = True
--- episode 8 finished ---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    terminated = True
--- episode 9 finished ---
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
--- episode 10 finished ---
action = 0  state = [0 0] -> [0 1]  reward = -1 terminated = False
action = 3  state = [0 1] -> [-1  1]    reward = -10    terminated = True
--- episode 11 finished ---
action = 3  state = [0 0] -> [-1  0]    reward = -10    terminated = True
--- episode 12 finished ---
action = 0  state = [0 0] -> [0 1]  reward = -1 terminated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 terminated = False
action = 1  state = [1 1] -> [1 0]  reward = -1 terminated = False
action = 0  state = [1 0] -> [1 1]  reward = -1 terminated = False
action = 3  state = [1 1] -> [0 1]  reward = -1 terminated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 terminated = False
action = 1  state = [1 1] -> [1 0]  reward = -1 terminated = False
action = 0  state = [1 0] -> [1 1]  reward = -1 terminated = False
action = 0  state = [1 1] -> [1 2]  reward = -1 terminated = False
action = 0  state = [1 2] -> [1 3]  reward = -1 terminated = False
action = 2  state = [1 3] -> [2 3]  reward = -1 terminated = False
action = 1  state = [2 3] -> [2 2]  reward = -1 terminated = False
action = 0  state = [2 2] -> [2 3]  reward = -1 terminated = False
action = 2  state = [2 3] -> [3 3]  reward = 100    terminated = True
--- episode 13 finished ---
scores[-1] # episode 13: 13 moves at -1 each, plus 100 at the goal
87
paths = [np.array([0,0])]+ list(player.next_states)[-20:] # last 20 stored next-states (includes the tail of earlier episodes)
show(paths)