# -*- coding: utf-8 -*-
import gym
import numpy as np
import matplotlib.pyplot as plt
import random as pr
env = gym.make('FrozenLake-v0')
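# FrozenLake-v0 is the 4x4 grid with slippery (stochastic) transitions by default,
# so the chosen action only moves the agent in the intended direction part of the time.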
# env.monitor.start('tmp/Frozenlake8x8-0.2', force= True)
# Initialize the Q-table
Q = np.zeros([env.observation_space.n, env.action_space.n])
num_episodes = 2000
discount = 0.99
learning_rate = 0.85
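# In the stochastic environment a learning rate below 1 blends each new TD target
# with the current Q value, so a single noisy transition does not overwrite
# what the table has already learned.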
# Lists to store the total reward and the visited states of each episode
rList = []
sList = []
# Argmax that breaks ties randomly when several actions share the maximum Q value
def rargmax(vector):
    m = np.amax(vector)
    indices = np.nonzero(vector == m)[0]
    return pr.choice(indices)
for i in range(num_episodes):
    # Reset the environment and the per-episode variables
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    sList = []
    # Decaying exploration rate: explore heavily in early episodes
    e = 1. / ((i / 10) + 1)
    # Q-table learning loop
    while not d and j < 250:
        j += 1
        # Epsilon-greedy action selection: a random action for exploration,
        # otherwise the action with the highest Q value
        if e > np.random.rand(1):
            a = env.action_space.sample()
        else:
            a = rargmax(Q[s, :])
        # Take the action and observe next_state, reward, done, info
        s1, r, d, _ = env.step(a)
        if r == 1:
            print("episode : ", i, " state record : ", sList)
        # Q-learning update: blend the old Q value with the new TD target;
        # the discount factor weights future rewards, favoring shorter paths to the goal
        Q[s, a] = Q[s, a] * (1 - learning_rate) + learning_rate * (r + discount * np.max(Q[s1, :]))
        s = s1
        rAll = rAll + r
        sList.append(s)
    rList.append(rAll)
print ("Final Q-Table Values")
print (" left down right up")
print (Q)
print("성공한 확률 : ", len(rList) / num_episodes)
plt.bar(range(len(rList)), rList, color="Blue")
plt.show()