[RL-Python] Contextual Bandit with a Simple Neural Network
import matplotlib.pyplot as plt
import numpy as np
import random
import torch
class ContextBandit:
    def __init__(self, arms):
        self.arms = arms
        self.init_distribution(arms)
        self.update_state()

    def init_distribution(self, arms):
        # One row per state, one column per arm: bandit_matrix[state][arm] is the
        # payout probability of that arm in that state (10 states for 10 arms).
        self.bandit_matrix = np.random.rand(arms, arms)

    def reward(self, prob):
        # Reward = number of successes in `arms` Bernoulli(prob) trials, so it lies in [0, arms].
        reward = 0
        for i in range(self.arms):
            if random.random() < prob:
                reward += 1
        return reward

    def get_state(self):
        return self.state

    def update_state(self):
        # The context (state) is a random integer in [0, arms).
        self.state = np.random.randint(0, self.arms)

    def get_reward(self, arm):
        return self.reward(self.bandit_matrix[self.get_state()][arm])

    def choose_arm(self, arm):
        # Return the reward for the chosen arm, then move to a new random state.
        reward = self.get_reward(arm)
        self.update_state()
        return reward
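A quick sanity check of the environment above (not part of the original listing; env_demo is a throwaway instance used only for illustration): each call to choose_arm returns an integer reward between 0 and 10 for the current context, then moves the environment to a new random state.

env_demo = ContextBandit(10)
print(env_demo.get_state())      # current context, an integer in [0, 10)
print(env_demo.choose_arm(1))    # reward for arm 1 in that context, an integer in [0, 10]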
def one_hot(N, pos, val=1):
    # Length-N one-hot vector with `val` at index `pos`; used to encode the current state.
    one_hot_vec = np.zeros(N)
    one_hot_vec[pos] = val
    return one_hot_vec

def softmax(av, tau=1.12):
    # Turn action values `av` into a probability distribution; tau is the temperature.
    softm = np.exp(av / tau) / np.sum(np.exp(av / tau))
    return softm
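To see what the temperature tau does (illustrative only; the values below are arbitrary and not from the original post): a small tau sharpens the distribution toward the best-looking arm, while a large tau flattens it toward uniform, i.e. more exploration.

av_demo = np.array([1.0, 2.0, 3.0])
print(softmax(av_demo, tau=0.5))   # peaked on the largest value (exploitation)
print(softmax(av_demo, tau=5.0))   # close to uniform (exploration)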
arms = 10
# N: batch size, D_in: input vector dim, H: hidden layer dim, D_out: output vector dim
N, D_in, H, D_out = 1, arms, 100, arms
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
    torch.nn.ReLU(),  # final ReLU keeps the predicted rewards non-negative
)
loss_fn = torch.nn.MSELoss()
env = ContextBandit(arms)
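Before training, a quick forward-pass check (not in the original post; dummy_state is just an illustrative name): feeding a one-hot state of length D_in through the untrained network yields D_out expected-reward estimates, one per arm.

with torch.no_grad():
    dummy_state = torch.Tensor(one_hot(arms, 0))   # hypothetical state 0, for illustration
    print(model(dummy_state).shape)                # expect torch.Size([10])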
def train(env, epochs=50000, learning_rate=1e-2):
    cur_state = torch.Tensor(one_hot(arms, env.get_state()))
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    rewards = []
    for i in range(epochs):
        if i % 1000 == 0:
            print(i)                                              # log progress occasionally
        y_pred = model(cur_state)                                 # expected rewards (for the 10 arms)
        av_softmax = softmax(y_pred.data.numpy(), tau=2.0)        # turn expected rewards into a probability distribution
        av_softmax /= av_softmax.sum()                            # re-normalize so the probabilities sum to 1
        choice = np.random.choice(arms, p=av_softmax)             # sample an action from that distribution
        cur_reward = env.choose_arm(choice)                       # get the reward for the chosen action
        one_hot_reward = y_pred.data.numpy().copy()               # copy y_pred before the observed reward is applied
        one_hot_reward[choice] = cur_reward                       # overwrite the chosen arm's entry with the observed reward
        reward = torch.Tensor(one_hot_reward)
        rewards.append(cur_reward)
        loss = loss_fn(y_pred, reward)                            # loss between the original y_pred and the target that reflects the observed reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        cur_state = torch.Tensor(one_hot(arms, env.get_state()))  # refresh the current state
    return np.array(rewards)

def running_mean(x, N=50):
    c = x.shape[0] - N
    y = np.zeros(c)
    conv = np.ones(N)
    for i in range(c):
        y[i] = (x[i:i+N] @ conv) / N
    return y

rewards = train(env)
plt.plot(running_mean(rewards, N=500))
plt.show()
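After training, one way to check what the network has learned (not part of the original post; evaluate and trials are hypothetical names introduced here): act greedily by picking the arm with the highest predicted reward for each observed state and average the rewards over a number of trials. The greedy average should sit noticeably above the roughly 5 you would get from picking arms uniformly at random.

def evaluate(env, model, trials=500):
    # Hypothetical helper, not in the original post: greedy policy evaluation.
    total = 0
    for _ in range(trials):
        state_vec = torch.Tensor(one_hot(arms, env.get_state()))
        with torch.no_grad():
            preds = model(state_vec)
        best_arm = int(torch.argmax(preds).item())   # exploit: arm with highest predicted reward
        total += env.choose_arm(best_arm)
    return total / trials

print("average greedy reward:", evaluate(env, model))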