brain.js
/**
* This file contains solely the Q-learning model for training our flappy bird.
* It takes input from the environment, such as the position of the flappy bird
* and the tubes, and responds with the appropriate action to take.
*
* Author @nellex
*/
/**
* The Q-table forms the heart of the Q-learning algorithm. Maintained for our
* agent Flappy bird, the table represents the state-action function, i.e. the
* relationship between a set of states (S) and the set of actions (A) =>
* Q[S,A]. For a given state 's' and a given action 'a', Q(s,a) denotes the
* expected reward of doing the action 'a' in the state 's'.
*
* In our learning model, the state of the environment is defined by:
* (1) speedY: The speed of the flappy bird along the Y-axis, i.e. the rate at
* which the bird is rising or falling
* (2) tubeX: The X-coordinate of the next incoming tube, i.e. how far the next
* tube is from the flappy bird
* (3) diffY: We define the ideal position for the flappy bird to pass through as
* the very middle of the vertical gap between the two tubes. The parameter
* 'diffY' denotes the difference between the Y-coordinate of this ideal passage
* position and the Y-coordinate of the flappy bird, i.e. how far below or above
* the bird is from where it should pass through the tube.
*/
var Q_table = {};
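// For illustration only (these numbers are made up, not produced by the game):
// getQ/setQ below flatten a state-action pair such as
//   state = { diffY: 4, speedY: -120, tubeX: 30 }, action = actionSet.JUMP
// into the array [4, -120, 30, 1], which JavaScript coerces to the string key
// "4,-120,30,1" when used as an object property. A populated table might
// therefore contain entries like:
//   Q_table["4,-120,30,1"] ->  2.3   (expected reward of jumping in that state)
//   Q_table["4,-120,30,0"] -> -1.1   (expected reward of staying in that state)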
/**
* The action set comprises:
* (1) Stay: Take no action, and just go with the flow of the gravity
* (2) Jump: Push the flappy bird upwards
*/
var actionSet = {
STAY : 0,
JUMP : 1
};
/**
* Defining the parameters for our Q-learning model:
* (1) Learning rate, alpha: Ranging between [0,1], it determines how quickly the
* flappy bird overrides its previously learned values with the new ones for the
* corresponding state
* (2) Discount factor, gamma: Determines the importance given to future rewards.
*
* In our game, if the flappy bird fails to clear the tube, the action it took
* most recently is penalized more than the action it took 10 steps earlier,
* because the recent actions have more influence on the success of the bird.
*/
var gamma = 0.8; // Discount factor
var alpha = 0.1; // Learning rate
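// For reference, the learning step that rewardTheBird() below applies
// incrementally (setQ adds the computed delta to the stored value) is the
// standard Q-learning update rule:
//   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
// A minimal sketch of that rule in isolation (illustrative only; this helper is
// never called by the game code):
function qLearningUpdateSketch(currentQ, reward, bestFutureQ) {
  return currentQ + alpha * (reward + gamma * bestFutureQ - currentQ);
}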
// Frame buffer for maintaining the state-action pairs in the current episode
var frameBuffer = [];
// Number of frames recorded in the current episode
var episodeFrameCount = 0;
// Index of the next incoming tube; a change in this index marks the
// completion of the current episode
var targetTubeIndex;
// The tube which the bird must clear next
var targetTube;
// To maintain the count on the number of trials
var trials = 0;
/**
* Function to look up the estimated Q-value (reward) in the Q-table for a given
* state-action pair
* @param {*} state State of the environment as described above
* @param {*} action The action to be taken
*/
function getQ(state, action) {
var config = [ state.diffY, state.speedY, state.tubeX, action ];
if (!(config in Q_table)) {
// If there's no entry in the Q-table for the given state-action pair,
// return a default reward of 0
return 0;
}
return Q_table[config];
}
/**
* Function to update the Q-value (reward) entry for the given state-action pair
* by adding the given amount to the existing entry
* @param {*} state The state of the environment
* @param {*} action The action taken for the given state
* @param {*} reward The reward increment to be added for the state-action pair
*/
function setQ(state, action, reward) {
var config = [ state.diffY, state.speedY, state.tubeX, action ];
if (!(config in Q_table)) {
Q_table[config] = 0;
}
Q_table[config] += reward;
}
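// Example usage (illustrative values only): starting from an empty table,
//   var s = { diffY: 4, speedY: -120, tubeX: 30 };
//   getQ(s, actionSet.JUMP);       // -> 0, no entry yet
//   setQ(s, actionSet.JUMP, 0.5);  // Q_table["4,-120,30,1"] is now 0.5
//   getQ(s, actionSet.JUMP);       // -> 0.5
// Note that setQ adds its third argument to whatever is already stored, so
// repeated calls accumulate the increments computed by the learning update in
// rewardTheBird() below.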
/**
* Function responsible for selecting the appropriate action for the given
* state. The action with the higher Q-value for the given state is 'generally'
* the one executed.
* @param {*} state The state of the environment
*/
function getAction(state) {
// Why always follow the rules? Once in a while (1/100000), our flappy bird
// takes a random decision without looking up the Q-table to explore a new
// possibility. This helps the flappy bird avoid getting stuck on a single
// path.
var takeRandomDecision = Math.ceil(Math.random() * 100000)%90001;
if (takeRandomDecision == 0) {
console.log("Going random baby!");
// 1 out of 4 times, it'll decide to jump
var shouldJump = (Math.floor(Math.random() * 4) == 0);
if (shouldJump) {
return actionSet.JUMP;
} else {
return actionSet.STAY;
}
}
// Lookup the Q-table for rewards corresponding to Jump and Stay action for
// the given state
var rewardForStay = getQ(state, actionSet.STAY);
var rewardForJump = getQ(state, actionSet.JUMP);
if (rewardForStay > rewardForJump) {
// If reward for Stay is higher, command the flappy bird to stay
return actionSet.STAY;
} else if (rewardForStay < rewardForJump) {
// If reward for Jump is higher, command the flappy bird to jump
return actionSet.JUMP;
} else {
// This is the case when the rewards for both actions are the same. In
// such a case, we pick the action at random. The probability of jumping is
// kept lower than that of staying to mimic the natural scenario: we press
// jump much less often than we let the flappy bird fall.
var shouldJump = (Math.ceil( Math.random() * 100 )%25 == 0);
if (shouldJump) {
return actionSet.JUMP;
} else {
return actionSet.STAY;
}
}
}
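// Example (illustrative values only): if Q_table holds 1.2 for STAY and -0.3
// for JUMP in some state, getAction() returns actionSet.STAY for that state,
// except on the rare random-exploration branch above. When both entries are
// equal (e.g. both still 0 for an unseen state), it jumps with roughly 1-in-25
// probability and stays otherwise.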
/**
* Function responsible for rewarding the flappy bird according to its
* performance. One thing to note here is that we found the behaviour of our
* Flappy Bird to be highly episodic: as soon as the flappy bird clears one
* obstacle, we terminate the episode there and then and reward it positively.
* A new episode is then started for the next obstacle, i.e. the next tube,
* which is treated as completely independent from the previous one.
*
* We reward the flappy bird at the end of an episode, hence we maintain a frame
* buffer to store the state-action pairs in sequential order and decide upon
* the reward to be awarded for each state-action pair on completion of the
* episode.
* @param {*} reward The amount of reward to be awarded to the Flappy Bird
* @param {*} wasSuccessful Determines whether the reward should be positive or
* negative, depending on whether the episode was completed successfully or not
*/
function rewardTheBird(reward, wasSuccessful) {
// Minimum number of frames to be maintained in the frame buffer for the
// episode (for maintaining the state-action sequence tail)
var minFrameSize = 5;
// Tolerable deviation from the ideal passage position between the tubes in px
var theta = 1;
var frameSize = Math.max(minFrameSize, episodeFrameCount);
// Iterate over the state-action sequence trail, from the most recent to the
// oldest
for (var i = frameBuffer.length-2; i >= 0 && frameSize > 0; i--) {
var config = frameBuffer[i];
var state = config.env;
var action = config.action;
// The reward for the state is influenced by how close the flappy bird was
// to the ideal passage position
var rewardForState = (reward - Math.abs(state.diffY));
// Determine if the reward for given state-action pair should be positive or
// negative
if (!wasSuccessful) {
if (state.diffY >= theta && action == actionSet.JUMP) {
// If the bird was above the ideal passage position and it still decided
// to jump, reward negatively
rewardForState = -rewardForState;
} else if(state.diffY <= -theta && action == actionSet.STAY) {
// If the bird was below the ideal passage position and it still decided
// to not jump (stay), reward negatively
rewardForState = -rewardForState;
} else {
// The bird made the right decision, so don't reward it negatively
rewardForState = +0.5;
}
}
// Update the Q-value for the state-action pair according to the Q-learning
// algorithm. Ref: https://en.wikipedia.org/wiki/Q-learning
var futureState = frameBuffer[i+1].env;
var optimalFutureValue = Math.max(getQ(futureState, actionSet.STAY),
getQ(futureState, actionSet.JUMP));
var updateValue = alpha*(rewardForState + gamma * optimalFutureValue - getQ(state, action));
setQ(state, action, updateValue);
frameSize--;
}
// Reward allocation is complete, hence clear the frame buffer, but still keep
// the most recent 5 state-action pairs, since the last actions taken in the
// previous episode affect the position of the bird in the next episode
frameBuffer = frameBuffer.slice(Math.max(frameBuffer.length - minFrameSize, 1));
episodeFrameCount = 0;
}
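// Worked example (illustrative numbers): suppose the bird hits a tube and
// triggerGameOver() calls rewardTheBird(100, false). For a buffered frame where
// state.diffY was 30 (the bird was 30 units above the ideal passage point) and
// the action was JUMP, the shaped reward is -(100 - 30) = -70: jumping while
// already above the gap was the wrong move. That -70 then flows into the
// Q-learning update above, together with gamma times the best Q-value of the
// following frame's state.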
/**
* Function to negatively reward the flappy bird when the game is over
*/
function triggerGameOver() {
var reward = 100;
rewardTheBird(reward, false);
console.log( "GameOver:", score, Object.keys(Q_table).length, trials );
// Reset the episode tracking index
targetTubeIndex = -1;
episodeFrameCount = 0;
trials++;
}
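// The log line above reads, e.g. (illustrative numbers), "GameOver: 3 1542 87":
// a score of 3 for this run, 1542 distinct state-action entries learned so far,
// and 87 completed trials.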
/**
* This function is executed for every step in the game and is responsible for
* forming the state and delegating the action to be taken back to our flappy
* bird
*/
function nextStep() {
// If the game hasn't started yet then do nothing
if (gameState != GAME)
return;
// Logic to determine if the Flappy Bird successfully cleared the tube. A
// change in targetTubeIndex denotes the completion of an episode
if (birdX < tubes[0].x + 3 && (tubes[0].x < tubes[1].x || tubes[1].x + 3 < birdX)) {
targetTube = tubes[0];
if (targetTubeIndex == 1) {
// The target tube changed from index [1] to [0], which means tube[1] was
// crossed successfully. Hence reward the bird positively
rewardTheBird(5, true);
}
targetTubeIndex = 0;
} else {
targetTube = tubes[1];
if (targetTubeIndex == 0) {
// The target tube changed from index [0] to [1], which means tube[0] was
// crossed successfully. Hence reward the bird positively
rewardTheBird(5, true);
}
targetTubeIndex = 1;
}
// We'll take no action if the tube is too far from the bird
if (targetTube.x - birdX > 28) {
return;
}
// Else, we'll form our state from the current environment parameters to be
// ingested by our algorithm
var state = {
speedY: Math.round(birdYSpeed * 100),
tubeX: targetTube.x,
diffY: (targetTube.y+17+6) - (birdY+1)
};
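// Note on discretization: birdYSpeed is scaled by 100 and rounded so that the
// continuous vertical speed collapses onto a manageable set of integer values,
// keeping the Q-table from growing a separate entry for every raw float. The
// constant offsets in diffY locate the ideal passage position described at the
// top of this file, i.e. the middle of the vertical gap between the two tubes.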
// Query the Q-table to determine the appropriate action to be taken for the
// current state
var actionToBeTaken = getAction(state);
// Push the state-action pair to the frame buffer so that we can determine the
// reward for it later on
var config = {
env: state,
action: actionToBeTaken
};
frameBuffer.push(config);
episodeFrameCount++;
// Delegate the action to our flappy bird
if (actionToBeTaken == actionSet.JUMP) {
birdYSpeed = -1.4;
} else {
// For the stay action, we do nothing and just let the bird fall due to
// gravity
}
}
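/**
* How this file is expected to be driven (a sketch under assumptions: the actual
* game loop lives in the game code, and every name below except nextStep and
* triggerGameOver is hypothetical):
*
*   function gameTick() {
*     updateBirdAndTubes();        // hypothetical game-side physics/tube update
*     if (birdHitTubeOrGround()) { // hypothetical collision check
*       triggerGameOver();         // penalize the episode and start a new trial
*     } else {
*       nextStep();                // let the Q-learning brain pick the next action
*     }
*     requestAnimationFrame(gameTick);
*   }
*/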