RL with python/Python example code (8 posts)

[RL-PyTorch] N Step Actor-Critic (2023. 9. 2.)
Monte Carlo: learns at the end of each episode. Temporal Difference (TD, fully online learning): learns at every step, so bias can grow when bootstrapping is applied. N-step TD: learns every N steps, so even with bootstrapping it is less biased than one-step TD. *Bootstrapping: making a prediction on top of another prediction, which is why it is desirable to gather as much data as possible before predicting. The preview shows only the start of a multi-step Temporal Difference implementation (imports of torch, torch.nn, torch.optim, numpy, torch.nn.functional) before it is cut off; an n-step return sketch follows this list.

[Python] Multi processing (2023. 9. 1.)
Preview code: import multiprocessing as mp and numpy, define square(x) returning np.square(x), build x_ = np.arange(72), create a pool with as many workers as there are CPUs (pool = mp.Pool(mp.cpu_count())), then use the Pool's map method to apply square to each 3-element chunk (x_[3*i:3*i+3]) and collect the results. The teaser is cut off at the start of a second listing; a runnable sketch follows this list.

[RL-PyTorch] Policy gradient method, REINFORCE algorithm (2023. 8. 28.)
Preview code: a running_mean(x, N=50) helper that smooths a curve with a length-N averaging kernel, env = gym.make("CartPole-v1", render_mode="human"), and the policy network sizes l1 = 4 (length of the input), l2 = 150 (hidden-layer dimension), l3 = 2 (output), cut off.

[RL-PyTorch] Deep Q-learning - Experience replay (2023. 8. 22.)
Preview code: imports (torch, Gridworld, deque, random, pylab), layer sizes l1 = 64, l2 = 150, l3 = 100, l4 = 4, a torch.nn.Sequential model of Linear/ReLU layers, loss_fn = torch.nn.MSELoss(), and a learning-rate definition, cut off.

[RL-PyTorch] Basic Deep Q-learning (2023. 8. 22.)
Preview code: the same Gridworld setup (l1 = 64, l2 = 150, l3 = 100, l4 = 4 Sequential model with MSELoss, learning_rate = 1e-3), cut off at the optimizer definition.

[RL-Python] Contextual Bandit with simple Neural Network (2023. 8. 22.)
Preview code: a ContextBandit class whose init_distribution builds bandit_matrix = np.random.rand(arms, arms) (10 states for 10 arms) and whose reward method adds 1 whenever random.random() < prob, cut off.

[RL-Python] Multi-armed bandit - softmax (2023. 8. 21.)
Preview code: a get_reward(prob, n=10) helper that counts how often random.random() < prob over n draws; the teaser note says this is used to increase the trial count by one and fold in the new reward, cut off.

[RL-Python] Multi-armed bandit - Epsilon Greedy (2023. 8. 21.)
Preview code: the same get_reward(prob, n=10) helper; the teaser note says this is used to increase the trial count by one and include the new reward, cut off. A bandit sketch follows this list.
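The N Step Actor-Critic notes above contrast Monte Carlo, one-step TD, and N-step TD targets, but the preview does not show the post's actual training loop. The following is only a minimal sketch of an n-step bootstrapped return; the function name n_step_return, the discount value, and the example numbers are placeholders rather than code from the post.

```python
# Minimal sketch of an n-step bootstrapped return (illustrative only;
# n_step_return and the example values are assumptions, not from the post).
import torch

def n_step_return(rewards, v_last, gamma=0.99):
    """rewards: the n rewards collected; v_last: the value estimate V(s_{t+n})."""
    G = v_last                      # bootstrap from the predicted value of the last state
    for r in reversed(rewards):     # fold the rewards back toward the first step
        G = r + gamma * G
    return G

# Example: 3-step return with a bootstrapped value of 0.5 for the final state
print(n_step_return([1.0, 0.0, 1.0], torch.tensor(0.5)))
```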
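The multiprocessing teaser is cut off mid-listing. A runnable version of what it shows might look like the sketch below; the `if __name__ == "__main__":` guard is an addition (needed when worker processes are spawned rather than forked) and is not visible in the truncated preview.

```python
# Runnable sketch of the truncated multiprocessing preview.
import multiprocessing as mp
import numpy as np

def square(x):
    return np.square(x)

if __name__ == "__main__":          # guard required under the spawn start method
    x_ = np.arange(72)
    cpu_n = mp.cpu_count()
    print(cpu_n)
    with mp.Pool(cpu_n) as pool:    # pool with as many workers as CPUs
        # apply square() to each 3-element chunk in parallel and collect the results
        squared = pool.map(square, [x_[3 * i:3 * i + 3] for i in range(cpu_n)])
    print(squared)
```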
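Both bandit teasers lose the comparison inside get_reward to the page markup, but the contextual-bandit preview shows the same pattern (if random.random() < prob: reward += 1). The sketch below reconstructs that helper and pairs it with a generic epsilon-greedy loop; the loop itself and names such as counts and values are illustrative assumptions, not the posts' actual code.

```python
# Sketch: the reward helper both bandit previews cut off, plus a generic
# epsilon-greedy loop with an incremental-mean value update (assumed structure).
import random
import numpy as np

def get_reward(prob, n=10):
    # draw n Bernoulli(prob) samples and return how many succeeded
    reward = 0
    for _ in range(n):
        if random.random() < prob:
            reward += 1
    return reward

n_arms, eps = 10, 0.1
probs = np.random.rand(n_arms)          # hidden payout probability of each arm
counts = np.zeros(n_arms)               # number of pulls per arm
values = np.zeros(n_arms)               # running mean reward per arm

for _ in range(500):
    if random.random() < eps:
        arm = random.randrange(n_arms)  # explore
    else:
        arm = int(np.argmax(values))    # exploit the current best estimate
    r = get_reward(probs[arm])
    counts[arm] += 1                                    # trial count + 1, as the teaser notes
    values[arm] += (r - values[arm]) / counts[arm]      # fold the new reward into the mean
```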