### A Pluto.jl notebook ###
# v0.19.9
using Markdown
using InteractiveUtils
# ╔═╡ 366fc4c8-2d15-11ed-18cd-c14f15c96f6e
using LinearAlgebra, Plots, StatsBase, SparseArrays, Colors, Random, PlutoUI
# ╔═╡ 3cdc0adf-2ba6-4213-b644-907533de412f
using DataFrames, CSV
# ╔═╡ 7f6763c0-c440-4e9d-a514-98ec4273bd57
begin
    using FASTX
    io = open("data/sapiens_yeast_proteins.fasta", "r")
    reads = [(description(rec), sequence(rec)) for rec in FASTAReader(io)]
    headers = first.(reads)
    sequences = last.(reads)
    n_seqs = length(sequences)
    species = [occursin("YEAST", h) ? "YEAST" : "HUMAN" for h in headers]
    humane = species .== "HUMAN"
end;
# ╔═╡ f6b4fc92-51fa-42ba-8f3e-6f81bc6d8624
md"""
# Tutorial on Hyperdimensional Computing
**Michiel Stock**
September 2022
"""
# ╔═╡ a3820f10-9930-4dba-85aa-80a09744578d
md"""
## Building a brain for dummies
Artificial neural networks, despite the name, still work very differently from real brains. Most networks are trained using top-down, gradient-based methods that are not remotely biologically realistic. Training a state-of-the-art deep learning model such as GPT-3 or DALL-E 2 can cost millions of dollars and requires specific infrastructure to deploy. Meanwhile, our brains generate general intelligence, trained in an unsupervised way, while consuming only about 20 watts, the energy of roughly three cans of coke a day. Brains are remarkably efficient.
Hyperdimensional computing (HDC) is a relatively new approach to artificial intelligence. It tries to mimic some aspects of brain functioning more closely. Here, we compute with hyperdimensional vectors (HDV), which is merely a fancy name for vectors of huge dimensionality, typically 10,000. Each HDV can represent a concept, for example, a word in a language, a position along an axis, an ingredient in a recipe or an atom in a molecule. Using a set of basic, well-chosen operations, one can combine these atomic vectors into new vectors: from words to sentences, from coordinates to a position in space, from ingredients to a recipe. These rules allow the user to create *structured* and *hierarchical* representations of whatever they want to model.
Hyperdimensional computing exploits the power of high dimensions by smearing out the information over the complete vector. Such a distributed, holographic representation allows building a remarkably robust and efficient learning system. Because HDC can be implemented with efficient bit operations, it can be made very energy-efficient. Furthermore, a basic HDC system is relatively easy to implement, as we will see.
"""
# ╔═╡ d3f992f2-bd1e-4544-809f-b9fab1c11005
TableOfContents()
# ╔═╡ 59f5ad60-4554-462e-9a7b-1b7e465b720b
md"""## Building blocks of HDC
To build an HDC system, we need the following building blocks:
1) HDVs themselves. These HDVs are just ordinary vectors.
2) Suited arithmetic operations to manipulate these vectors.
3) Metrics to compute similarities between HDVs.
"""
# ╔═╡ 09efbacf-8ee9-46c1-a4a7-11921fbc783b
const N = 10_000
# ╔═╡ 0424b37b-64f6-48c6-8e61-85b6e62fc93c
md"""
### Generating hyperdimensional vectors
Hyperdimensional vectors are nothing more than vectors of a very high dimension. The dimensionality should be sufficiently large that one can comfortably store all the concepts of the system of interest. For our system to be robust, the space of vectors should also be large enough that *a randomly-drawn vector is almost surely unrelated to meaningful vectors*. In this notebook, we work with a dimensionality of $N =$ $N.
"""
# ╔═╡ b5358dae-d136-4343-a3d9-d58ab5f1a79c
md"""
The nature of the vectors, whether filled with real, binary or bipolar (i.e., -1 and 1) values, is less critical than the vectors being *large*. Of course, the nature of the vectors determines the choice of the associated operators.
Typically, one uses either binary or bipolar vectors. The former admits highly efficient bit operations; the latter has the simpler mathematics. We choose the latter. To ease the explanation, I will still refer to the individual elements of such vectors as 'bits'.
We assign a randomly chosen HDV to each atomic concept. Generating a random HDV is easy.
"""
# ╔═╡ 0418d5fd-df04-4d56-ba7c-d45d18aa161d
hdv() = rand((-1,1), N)
# ╔═╡ 49018537-b529-4d07-b1cb-d34fe58294f0
md"We expect about half of the elements of two randomly-chosen vectors to match."
# ╔═╡ 0c72733c-897b-4d9b-84f7-98efe2f197de
md"""
Our two randomly generated vectors show that about half of the $N$ elements match. The expected value is $N/2$, with a variance ($\sigma^2$) of $N/4$ and thus the standard deviation ($\sigma$) is $\sqrt{N}/2$.
> Any two randomly chosen HDVs will almost surely share between $N/2-3\sqrt{N}/2$ and $N/2+3\sqrt{N}/2$ elements, i.e., between 4,850 and 5,150 for $N=10{,}000$.
"""
# ╔═╡ f34d034a-942b-41a0-92ec-23608a43f778
md"""
When we need a set of `n` vectors, it makes sense to generate a matrix in one go.
"""
# ╔═╡ 4c647ad6-3e70-48f6-b778-8f9a34d58a6f
hdv(n) = rand((-1,1), n, N)
# ╔═╡ 67dc0274-ae49-4262-ab9d-a7716ab7e851
x = hdv() # a HDV!
# ╔═╡ 8123aaec-aa78-495e-83a1-3a4f4b244000
y = hdv() # another one!
# ╔═╡ ea146895-6d82-4dcb-a24d-0dae70ae39b9
sum(x.==y)
# ╔═╡ 7ea554a1-5efb-461f-9654-2abff0dc80b5
V = hdv(10) # each row is a HDV
# ╔═╡ 1a7bbd06-b6d8-4e0a-a6d7-cb082b9bb69d
md"""
### Bundling
Our first operation is **bundling** or aggregation. This operation combines two or more HDVs into a new HDV that *is similar to all elements in the set*. For bipolar vectors, the element-wise majority fits the bill. Note that when we bundle two HDVs, many of the elements will be set to zero, indicating that the corresponding elements of the parents were in disagreement.
"""
# ╔═╡ a40b8155-34f6-47cc-aaa0-82069ca2dfb5
bundle(U::Matrix{Int}) = sum(U, dims=1)[:] .|> sign
# ╔═╡ eebcab6e-a7e9-4cfd-9e6e-3e39c5d4b8f6
bundle(xs::Vector{Int}...) = reduce(.+, xs) .|> sign
# ╔═╡ 119cd151-26b8-4d4e-a919-88c54cac8616
bundle(x, y)
# ╔═╡ adf79e3a-26f3-4afe-bd14-fc48880a29ec
bundle(V) # bundling makes most sense to find agreement among several HDVs
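# ╔═╡ 9f1e3d5c-7a2b-4c8e-a6d0-b4f2c8a1e3d7
md"""
The bundle is indeed similar to each of its parents: for two bipolar HDVs, the cosine similarity with either parent is about $1/\sqrt{2} \approx 0.71$ (about half of the bundle's elements match the parent exactly; the rest are zero).
"""
# ╔═╡ 4a8c2e6f-0d1b-4f3a-8e7c-5b9d1f3a7c2e
cos(bundle(x, y), x)  # ≈ 0.71: the bundle resembles its parents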
# ╔═╡ 3578bb7d-353d-4f48-9f50-50b4aaaad19d
md"""
### Binding
The second operator is **binding**: combining two vectors into a new vector that is different from both. Binding encodes an *interaction* between two concepts. For bipolar vectors, element-wise multiplication has the desired properties. When two elements agree ((-1, -1) or (1, 1)), the result is 1. When they disagree ((-1, 1) or (1, -1)), the result is -1. For binary vectors, one typically uses the XOR function, which yields `true` only when the input bits differ.
"""
# ╔═╡ f8170a5b-4c19-4c64-ad3a-ab82a8b448c6
bind(xs::Vector{Int}...) = reduce(.*, xs)
# ╔═╡ 591e5938-3bba-4b72-8d20-0848884e732b
bind(x, y)
# ╔═╡ 3b543c80-8f36-49e9-a2fc-a80b6677ad1e
md"Binding is reversible:"
# ╔═╡ 9d2ceb8b-61ea-49c5-b3a4-08b061423e7c
bind(bind(x, y), y) == x
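# ╔═╡ 6c0e2a4b-8d7f-4b1c-9a3e-d5f7b9c1e3a5
md"""
In contrast to bundling, the bound vector is *dissimilar* to its inputs:
"""
# ╔═╡ 1b3d5f7a-9c2e-4d6b-8f0a-c2e4a6b8d0f1
cos(bind(x, y), x)  # ≈ 0: binding produces a vector unrelated to both parents
# ╔═╡ 8e0a2c4d-6f1b-4a7e-b3c5-9d1f3b5d7f9a
md"""
For binary vectors, the XOR binding mentioned above is just as reversible; a minimal sketch (binary vectors are not used further in this notebook):
"""
# ╔═╡ 5d7f9b1c-3e0a-4c2d-a8b6-f0c2d4e6a8b0
let a = rand(Bool, N), b = rand(Bool, N)
    (a .⊻ b) .⊻ b == a  # XOR is its own inverse
end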
# ╔═╡ 66440162-3b60-4078-bdcf-3426403fcc2f
md"""
### Shifting
Our last operation for creating new vectors is a simple unary one: it creates a new HDV from an old one by shifting its elements. This is important, for example, when one wants to include positional information (e.g., in a sentence). An easy trick is to perform **shifting** using cyclic permutation: just move every element of the HDV one or more steps to the right. Elements that fall off the end are placed back at the beginning.
"""
# ╔═╡ 4e956ea8-fd2d-4fdb-be33-f8987b2bd1e5
shift(x, k=1) = circshift(x, k)
# ╔═╡ b2dc53a5-0bcd-4551-bf10-9ce405e8b651
shift(x)
# ╔═╡ abab4185-4b15-4457-aa63-3c825477ae48
sum(x.==shift(x)) # shifted vector only shares half its elements with the old one
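# ╔═╡ 3c5e7a9b-1d4f-4e8a-b0c2-6a8e0c2d4f6b
md"""
Like binding, shifting is reversible: shifting back by $-k$ steps recovers the original vector.
"""
# ╔═╡ 0a2c4e6f-8b1d-4d3f-9c5e-7a9b1d3f5a7c
shift(shift(x, 2), -2) == x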
# ╔═╡ a1eedfc9-4ee4-4f0c-8f89-1fe70b71a686
md"""
### Similarity between HDVs
Finally, we need a way to quantify the **similarity** between two HDVs. This will allow us to detect related patterns. A simple way would be to count the number of matching bits, which, as shown above, is expected to be around $N/2$ for two randomly drawn HDVs.
For bipolar HDV, the **cosine similarity** is a good similarity measure:
$$\cos(\mathbf{x}, \mathbf{y}) = \frac{\mathbf{x} \cdot \mathbf{y}}{||\mathbf{x}||\, ||\mathbf{y}||}\,.$$
The cosine similarity always lies between -1 and 1.
"""
# ╔═╡ 8b153389-1da3-47b4-b747-ec252e08d90d
cos(x, y) = dot(x, y) / (norm(x) * norm(y))
# ╔═╡ 59425927-abaa-45a9-a5e8-1619ae806840
cos(x, y) # random vectors, so close to 0
# ╔═╡ c5a8dd2f-b134-4b7b-96f3-3ad3611d92a4
md"""
### Properties of the operations
Bundling, binding, and shifting allow for meaningful computation on the concepts your HDVs represent. One vital property that binding and shifting possess is that they are *reversible*: applying them does not remove any information. Binding also distributes over bundling:
"""
# ╔═╡ 3bd2b8c8-7a57-4080-8861-f231dfa4471e
z = hdv()
# ╔═╡ 3ce916d7-935d-43ee-b5dc-10651f281985
bind(x, bundle(y, z)) == bundle(bind(x, y), bind(x, z))
# ╔═╡ efab3c5b-bb21-486e-8474-6b8440c9e42b
md"This means that transforming a bundle of concepts with binding is equivalent to binding every element before bundling.
Binding also preserves distance and similarity."
# ╔═╡ a7d1326e-250c-4fd7-af2b-2f5e0504f03e
cos(x, y) ≈ cos(bind(x, z), bind(y, z))
# ╔═╡ 8fc6f3a0-e05d-44cc-b3e3-bf35c1d02918
md"Similarly, permutation or shifting also perserves the similarity between two vectors."
# ╔═╡ 432a9fae-1c68-4312-8837-9fe489e26791
cos(x, y) ≈ cos(shift(x), shift(y))
# ╔═╡ b5e16410-82eb-4851-a20d-92106c2654e9
md"""
## The blessing of high dimensions
While some machine learning methods reduce dimensionality, the magic of HDC arises from *expanding* the dimensionality. Feature expansion is far from a novel concept: deep learning does it, kernel methods do it, and so on. Moving your problem to a space of vast dimensionality can make it easier to solve. You could comfortably fit, for example, all of organic chemistry in such a system. The vast majority of the vectors in such a space are uncorrelated, as shown below.
"""
# ╔═╡ 001f44f6-7e3f-4312-8387-9587f7811fdb
let
    p = histogram([cos(hdv(), hdv()) for i in 1:1000], xlims=(-1,1), label="", xlabel="cos. sim.", title="Cosine similarity between randomly chosen HDVs")
    vline!(p, [0], label="mean", lw=2)
end
# ╔═╡ 84a318f7-3d7c-4809-91ce-d3388f915b3f
md"""
As you can see, most HDVs are approximately orthogonal to one another. This implies that when two vectors *are* related (sharing substantially more than 50% of their bits), this is a good indication that they are related semantically! To put a number on this, there is far less than a one per cent chance that two random HDVs share more than 5,250 of the 10,000 bits, which corresponds to a cosine similarity of 0.05. This makes HDC very sensitive to picking up related concepts.
Every vector also has billions of neighbours that differ in only a few bits. So, x and y may both be related to z, while x and y themselves are unrelated. This property allows one to build semantic links in the space. Bird and sock might be unrelated, but they can be conceptually linked as bird ~ chicken ~ drumstick ~ leg ~ foot ~ sock.
"""
# ╔═╡ 27e6169f-1bfa-4e59-bf0f-872a432983d5
md"""
## Examples
Let us explore some didactic examples to show how objects, numbers, colours, and sequences can be represented as HDVs and how we can learn with them.
"""
# ╔═╡ c6718aac-f7e9-47b7-9698-d52c31208db9
md"""
### Example 1: colour matching
Let us start with a simple toy example. We have a list of emojis, each with associated colours. Can we find the *average* colour per emoji?
Take a look at the data.
"""
# ╔═╡ 853a5ec0-4a9a-4430-b0f2-b3e7994c25ba
md"""
Colours are a bit trickier. This is an example colour:
"""
# ╔═╡ a193a721-3caa-4069-b53f-3cdb58cb075e
md"""
We see that a colour can be represented by three numbers: the fractions of red, green, and blue. Every value is just a number between 0 and 1. If we can construct an embedding for numbers, we can represent a colour as a *binding* of three numbers.
Representing numbers in a fixed interval $[a, b]$ with HDVs is relatively easy. We first divide the interval into $k$ equal parts and generate an HDV representing the lower bound of the interval. For every subsequent step, we flip, on average, a fraction $1/k$ of the elements of the previous vector, so neighbouring steps remain similar while distant steps become unrelated.
"""
# ╔═╡ ec7cfa4e-d394-4b2a-9a4a-160c080fa341
function range_hdvs(steps)
    k = length(steps) - 1
    V = hdv(k + 1)
    for i in 2:k+1
        for j in 1:N
            # flip each element with probability 1/k to obtain the next step's HDV
            V[i,j] = rand() < 1 / k ? -V[i-1,j] : V[i-1,j]
        end
    end
    return V
end
# ╔═╡ 16e9a27f-0275-48b3-bbc3-9394725ec595
md"Let us represent a color chanel from 0 to 1 in steps of 0.05. This resolution should suffice."
# ╔═╡ c6ecd79c-0285-47a2-9c33-af440f5d3325
color_steps = 0:0.05:1
# ╔═╡ 0a1227c0-6557-46e7-8dcb-e7867b1aac94
const reds_hdv = range_hdvs(color_steps)
# ╔═╡ 0f174703-27df-4c07-a03d-95cf482a1c1d
md"Take a look at the correlation. HDVs that represent numbers closer to onother are more similar:"
# ╔═╡ 7976d118-a1e8-43d8-862b-a21a05a7306e
let
    k = length(color_steps)
    S = [cos(reds_hdv[i,:], reds_hdv[j,:]) for i in 1:k, j in 1:k]
    heatmap(color_steps, color_steps, S, title="cosine similarity between range HDVs")
end
# ╔═╡ 6aa8efb1-8705-474d-bd79-a75d78643f6c
md"Repeat for green and blue. It is important we have fresh HDVs so that our channels are orthogonal!"
# ╔═╡ 9c2b8b28-751d-4def-ae1c-9b1bf545f23f
const greens_hdv = range_hdvs(color_steps);
# ╔═╡ d751b589-8c7d-48d5-9548-3fe105b969e8
const blues_hdv = range_hdvs(color_steps);
# ╔═╡ 909a155e-5cd0-4360-ac9c-4761521293db
md"""
A colour is just a triple of RGB channel values (other representations, such as hue-saturation-value, might be better, but RGB works fine for our purposes). We can create a colour HDV by binding the three channel HDVs.
"""
# ╔═╡ 51e5be0b-4dfa-40c3-bbda-bd3b44595c86
col_ind(v) = round(Int, v * 20) + 1  # map a channel value in [0, 1] to an index in 1:21
# ╔═╡ 5eb33b1a-f91e-481c-893b-7e179a8136e5
encode_col(col) = bind(reds_hdv[col_ind(col.r),:], greens_hdv[col_ind(col.g),:], blues_hdv[col_ind(col.b),:])
# ╔═╡ 3de9f13e-4a6f-4ba0-b2d4-86fa617c7317
md"Try it on our example colour:"
# ╔═╡ aab48f4b-10da-4872-aee2-bec419f867d2
md"""
Mapping from colour to HDV is straightforward. The big question is, can we do the reverse: *going from HDV to the corresponding colour*? This is an inverse problem and is generally difficult to solve. We will deal with the issue by using a simple *table lookup*:
1. generate a list of many random colours;
2. get the HDV for each colour;
3. to map a new HDV $\mathbf{x}$ to a colour, just look for the most similar colour in the database using the cosine similarity and return it (sketched below).
"""
# ╔═╡ 3f51851d-bfbd-4301-880f-923851305bf5
md"Now that we can encode emoji and colours, we can represents couples with yet another binding operation. Subsequently, we can embed the complete dataset by bundling."
# ╔═╡ 93c47ae3-1fcf-470f-90c6-28dad06631ad
md"From the dataset embedding, we can now extract all the 'average' colours for each emoji. Remember, by binding again with the fire truck, we reverse the initial binding."
# ╔═╡ 8e334f45-08d0-455b-9a99-a1978cb9ae61
md"""
This seems to work quite well! Let us move to a variation of this problem. With every emoji, we now give *three* colours: one related to the emoji (we don't know which one) and two randomly generated. Can we still find the matching colour?
"""
# ╔═╡ 4ede9c89-a64a-40f8-898a-22b7edb7d616
md"""
This is an example of a *multi-instance problem*: every observation comes with several candidate instances, and we know at least one is relevant. It pops up frequently in weakly-supervised learning, for example, when you have an image containing multiple objects but only a single label for the whole image, or when a molecule has several configurations of which at least one is chemically active.
For HDC, dealing with multi-instance learning is a breeze. We can bundle the different colours. Due to distributivity, the irrelevant colours will act as noise and will be averaged away when bundling the complete data.
"""
# ╔═╡ 02f703ff-f44f-445d-bfb6-9aa3273c0393
md"Not bad right? Dealing with this issue is quite complex in most machine learning algorithms but HDC gives us the tools to model the structure of our problem."
# ╔═╡ 00b949c3-5986-4f72-a1db-45f5fa320d91
md"""
### Example 2: recipes
Let us move to a real dataset: the recipes dataset of [Ahn et al.](https://www.nature.com/articles/srep00196). It can teach us which regions tend to use which ingredients. It contains over 55,000 recipes from all over the world, each represented as a bag of ingredients.
"""
# ╔═╡ 2df88a19-6965-4e48-92f9-3725ff86f7ad
md"These are all the ingredients:"
# ╔═╡ 6f8d9c88-1686-4a54-9cf6-ce53c31629a8
md"We pick a random vector for each ingredient:"
# ╔═╡ d3c07d85-90e0-43ec-8468-b7db81629754
md"""
Here, we treat every ingredient as an independent entity. The dataset also contains information on the type of ingredient (fruit, vegetable, dairy...) and its flavour profile. If we feel like it, we could incorporate this information to make a better representation. We can also do some clever things to make ingredients that co-occur in recipes more similar.
Given the HDVs of the ingredients, we can represent a recipe as the bundling of the composing ingredient representations.
"""
# ╔═╡ f32fe25f-e45f-478a-a540-f406e37390da
md"""
Again, this is only the most basic thing we can do! We could also use binding to encode interactions between ingredients, which is, of course, hugely important for a realistic cooking model.
Finally, we can bundle all 55,000 recipes into a single vector representing the distribution of recipes:
"""
# ╔═╡ 31d654a5-4258-46d9-a61b-1bbde14c21df
md"We can also do this for each of the origins separately. This collects information on ingredient use per region:"
# ╔═╡ 71462659-b977-4323-8ab0-00800ff662bc
md"Let us take a look at how the regions differ in recipe composition!"
# ╔═╡ b92c8919-0051-4be7-8a89-5ae5ccdd2b12
md"A simple PCA gives a graphical representation."
# ╔═╡ df50f82c-f2d1-4a64-854d-8702c313da38
md"""
What are the most essential ingredients for each region? We can match the embedding of each ingredient with the embedding of each region with the cosine similarity. Take a look at the top 10 ingredients per region below.
"""
# ╔═╡ f294ddf4-d6a3-47a0-b2b5-61592ebec2cc
md"We can create embeddings of new recipes and study them."
# ╔═╡ ae5e090e-9a81-4a3f-9c95-f2da12cf7562
md"Does this match the distribution of the true recipes?"
# ╔═╡ 64f6bc4d-73b2-4d5d-baf9-e41a867e44b4
md"As a reference, this is what we get if we match a random set of ingrdients to the master recipe embedding:"
# ╔═╡ b789899d-c95b-4a08-8051-25da7750e7d1
md"We can check which region matches best to our recipe."
# ╔═╡ 45cabcf5-a686-41e5-8b0d-cf55a62df956
md"And finally, we can look for the best ingredient that would complete our recipe. Just add ingredients, one at a time, and compare with the master recipes embedding."
# ╔═╡ 63d69906-35fc-4d19-8cce-d6b6e3774b8f
md"""
### Example 3: protein classification
As the last application, let us deal with some sequence data. To this end, we load 1000 protein sequences: half originating from humans, half from yeast. Can we build a simple classifier to discriminate between the species?
"""
# ╔═╡ 1e178b1c-9ae4-4dff-b140-c37a70d74fac
md"For our purposes, a protein just a sequence of amino acids. Ignoring any physicochemical similarity between them, we assign a random HDV to each HDV."
# ╔═╡ f5e89e20-ef6d-49e0-8fbd-62b7d828d9cf
md"""
To encode the sequences, we will consider each one as a bag of trimers. We can use binding to create trimer-HDVs from the character vectors, but we must be careful to retain the order information, since `bind(x, y, z)` is the same as `bind(y, x, z)`: binding is commutative. We can encode the position information using shifting:
`bind(x, shift(y), shift(shift(z)))`
As there are only $20\times 20 \times 20=8000$ trimers, we can precompute them.
"""
# ╔═╡ 9e3279d9-5769-4b50-83d5-72c77ead7952
md"Then we look at all trimers in a sequence and bundle them."
# ╔═╡ ad4b7552-d7d2-4f7f-aea2-c39d6bcbf80f
asequence = "pelican"
# ╔═╡ b9a758b2-3522-4f5d-8403-b82ff321f9df
for i in 1:length(asequence)-2
    println(" "^(i-1) * asequence[i:i+2])  # print each overlapping trimer, indented to its position
end
# ╔═╡ d89e69ed-7c14-47d7-8aa8-dd07dfa7da18
md"We can take a look at the similarity between sequences and represent this in 2D using PCA."
# ╔═╡ 9008144b-a78c-426e-9cd2-366618cd8bd2
md"Let us build a simple classifier to predict the species. We take 80% of the data for training and 20% for testing. As the data is balanced, it is easy to compare. Let us explore three strategies to 'train' a classifier."
# ╔═╡ 0c5fce59-684c-4a3a-a1f9-795b989d5aa3
train = rand(n_seqs) .< 0.8; # take 80% of the sequences for training
# ╔═╡ 9c0ab3a8-d5d8-4f50-8a16-1d6c9c8582d4
test = .! train;
# ╔═╡ b4394de6-cd0d-4a9b-8350-8433f3b43b4f
md"We can make prototypes for both species."
# ╔═╡ 06eeaea3-44ec-4cc5-8d8a-5f8db17cfefd
md"**Strategy 1**: is a new protein sequence closer to the human of yeast prototype?"
# ╔═╡ 97624ac0-0f33-4798-8f78-70a3ddbc6f0e
md"""
Not too bad for such a simple model. Most applications that use HDC for machine learning add some form of *retraining*: they look at datapoints that were wrongly classified and use them to improve the prototypes. Again, we won't bother here. Instead, let us look at a *discriminative* approach.
**Strategy 2**: subtract the summed yeast sequence embeddings from the summed human ones to get a difference vector.
"""
# ╔═╡ 202f239b-ee55-4fbc-b36b-30e0a409ada9
md"""
We see that this is a slight improvement. Let us fit a slightly more clever model, using weights for the individual bits.
**Strategy 3**: fit a Naive Bayes model.
Given the label, we assume that all the HDV bits are independent. Because the information is distributed over the whole vector, this is not a strange assumption to make. For any given bit, we can estimate the probability of it being positive, given that the sequence is human or yeast. For every position, we compute the log-odds ratio
$$\log\left(\frac{\frac{p^+_{human}}{1-p^+_{human}}}{\frac{p^+_{yeast}}{1-p^+_{yeast}}}\right)\,,$$
where $p^+_{human}$ is the probability that the bit at that position is positive in the HDVs of human sequences. For example, a bit that is positive in 80% of human training HDVs but only 30% of yeast ones gets a weight of $\log(0.8/0.2)-\log(0.3/0.7)\approx 2.2$, voting for 'human' whenever it is positive. Adding these quantities over all the elements gives a pretty good global log-likelihood ratio.
"""
# ╔═╡ 959010fe-9ae1-4a47-8e62-1b867a8c3b9b
md"""
The Naive Bayes approach seems to work even better here!
We can look at which trimers agree best with our weight vector. This might be useful to discover which properties are important for making the prediction.
"""
# ╔═╡ 3969cdaa-0f7a-47f7-9061-b8bc35fc103b
md"Again, one can add many improvements to our sequence model."
# ╔═╡ f91210a1-fa3f-4894-ae9d-e15ec62439c6
md"""
## Comparison with other methods
### With deep learning
HDC seems to complement deep learning quite well. For 1D sequence-like data, HDC has been shown to outcompete convolutional neural networks. The latter still deliver superior performance for 2D image-like data, at the expense of much more computation. I got intrigued by HDC by [a paper](https://arxiv.org/abs/2106.02894) that showed it works about as well as graph convolutional neural networks for predicting drug-like properties while taking mere minutes to train. HDC is an energy-efficient alternative to deep learning, with potential in fields such as [robotics](https://www.science.org/doi/10.1126/scirobotics.aaw6736).
HDC allows for more straightforward incorporation of prior knowledge and can handle specific problems more efficiently, such as the multi-instance setting. It is also reasonably data-efficient. I think it has tremendous potential for challenges such as causal inference.
"""
# ╔═╡ 6d82ba5f-40db-4df8-af41-0587cedacd74
md"""
### With kernel methods
HDC evokes strong parallels to kernel-based methods. Both work by representing the objects in some high-dimensional feature space. Kernel methods perform an *implicit* feature mapping to Hilbert space, accessed through dot-products in this space. One obtains powerful nonlinear models by replacing dot products with kernel evaluations in linear learning algorithms such as SVM or PCA.
HDC creates *explicit* and often discrete feature spaces. Rather than relying on linear algebra and linear learning methods, one typically uses nearest-neighbour-like algorithms, prototypes and the like. However, this seems to be more of a cultural choice, as the former can certainly be used in tandem with HDVs. The explicit feature space makes HDVs easier to manipulate than objects in Hilbert space. An added advantage (at least in my point of view) is that HDC is much easier to explain to the non-expert. HDC seems to shine for discrete problems and is slightly clumsy in modeling real-valued objects compared to kernel methods.
"""
# ╔═╡ d6ddd663-8069-4b1c-8af9-2699411343ea
md"""
### RandNLA
Hyperdimensional computing also evokes the ideas discussed in [randomized linear algebra](https://cacm.acm.org/magazines/2016/6/202647-randnla/fulltext) (RandNLA), which uses random sampling and projection to make certain linear algebra computations such as solving a system or computing the eigenvalues easier. RandNLA also distributes the information over all the matrix dimensions, improving their properties, such as the condition number. Matrices containing HDVs are likely to be well-conditioned, as all the vectors "look the same", having very similar norms etc.
"""
# ╔═╡ 5f584667-d3f0-4865-ac7c-23db4cdc7b71
md"""
## Further reading
> Kanerva, P. (2009) *Hyperdimensional computing: An introduction to computing in distributed representation with high-dimensional random vectors* Cognitive Computation
A great introduction to the idea and how to compute with HDVs. The author's [slide deck](https://web.stanford.edu/class/ee380/Abstracts/171025-slides.pdf) is very accessible.
> [Hyperdimensional computing and its role in AI](https://medium.com/dataseries/hyperdimensional-computing-and-its-role-in-ai-d6dc2828e6d6)
An outstanding blog post.
> Hassan et al. (2021) *Hyper-dimensional computing challenges and opportunities for AI applications* IEEE Access
Recent review and comparison with deep learning.
> Mitrokhin et al. *Learning sensorimotor control with neuromorphic sensors: Toward hyperdimensional active perception* Science Robotics
Case study of HDC in robotics. Though I am not really convinced of their scheme for encoding semantic meaning in the HDV, it explains the basic concepts of HDC quite well.
"""
# ╔═╡ 8111b866-9233-457d-ac6f-68b5dd766afa
md"""
## Appendix
This appendix contains some helper functions and code to generate the three examples.
"""
# ╔═╡ 9899dc83-4354-4217-9c0e-bbaa3db976b2
"""Get the top-10 elements from a list of (key, value), by value."""
top10(list) = sort!(list, rev=true, by=t->t[2])[1:10];
# ╔═╡ 7548130b-952b-41ab-8245-22da81a3c6be
md"""
### Generating the colour data
"""
# ╔═╡ fe25d401-c7f2-4657-bdad-ec393f0bcd5e
"""Randomly draw a color from the RGB-space"""
randcol() = RGB(rand(), rand(), rand());
# ╔═╡ b4655d6c-2169-4336-86c4-f97d55eca319
acolor = randcol()
# ╔═╡ a1861373-574f-452b-9a85-2772ddf0e585
acolor.r, acolor.g, acolor.b
# ╔═╡ c55f2c0c-f643-4ca8-8060-0a56f67cd1e8
colhdv = encode_col(acolor)
# ╔═╡ 5d3da98f-9f6c-4f8b-9229-7613cc460a2e
const ref_colors = [randcol() for i in 1:1000] .|> c -> (color=c, hdv=encode_col(c))
# ╔═╡ 425a47da-c33d-4f52-9f9e-e1429129fb77
decode_colors(v) = argmax(((c, cv),) -> cos(v, cv), ref_colors) |> first  # return the reference colour whose HDV matches v best
# ╔═╡ 3a762ac8-4d6a-48a3-ac32-dd1f488aacfe
decode_colors(colhdv) # we more or less recover the color!
# ╔═╡ 291b6f8c-038f-4a1c-a8d8-69d0a99a17de
decode_colors(hdv()) # decoding a random HDV
# ╔═╡ 75c22e04-c70a-43b0-a0f2-87cd6053adaa
begin
    # collect all colors
    reds = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("red", n)]
    blues = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("blue", n)]
    greens = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("green", n)]
    oranges = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("orange", n)]
    greys = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("grey", n)]
    yellows = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("yellow", n)]
    whites = [RGB((c./255)...) for (n, c) in Colors.color_names if occursin("white", n)]
end;
# ╔═╡ d1f8305e-fc31-4f16-904b-44e409c570c5
emojis_colors = Dict(:🚒 => reds, :💦 => blues, :🌱 => greens, :🌅 => oranges, :🐺 => greys, :🍌 => yellows, :🥚 => whites)
# ╔═╡ 93e2398c-f423-4e67-a8e1-87f0b304ff83
emojis = collect(keys(emojis_colors))
# ╔═╡ 4b63258f-c759-4f89-8949-272d7b596bc0
md"Encoding emojis is easy! There are only $(length(emojis)) of them. Let us give them all a unique HDV."
# ╔═╡ bf0286e1-ec61-4799-b068-8286916290d5
emojis
# ╔═╡ d5f14066-1773-4dd7-93fd-d8d70bba6c04
emojis_hdvs = Dict(s => hdv() for s in emojis)
# ╔═╡ 42a4f962-a3f4-464c-9af7-abe1168738cb
encode_emoji_col_pair((s, c)) = bind(emojis_hdvs[s], encode_col(c))
# ╔═╡ e9135797-b217-4833-8d69-b06615ebb10c
encode_shape_col_pair2((s, colors)) = bind(emojis_hdvs[s], bundle(encode_col.(colors)...))
# ╔═╡ 33187ee1-26ca-445a-8dd4-e6c752e4ddcd
toy_data1 = [rand(emojis) |> l->(l, rand(emojis_colors[l])) for i in 1:100]
# ╔═╡ 845d7044-e9ae-4e83-bc39-969cd49405e4
toy_data1
# ╔═╡ eb2e1852-79f9-45fe-a6f9-818e56aa0f5f
col_emoji_hdvs = encode_emoji_col_pair.(toy_data1)
# ╔═╡ 3d50923f-e5c0-4a8c-809c-4856068756b9
toy_data_emb = bundle(col_emoji_hdvs...)
# ╔═╡ c6a50c5f-3702-4c64-b47b-925189ebcfbe
bind(toy_data_emb, emojis_hdvs[:🚒]) # embedding of color of the truck
# ╔═╡ c021ab9a-a097-4994-917b-74c767189c2d
decode_colors(bind(toy_data_emb, emojis_hdvs[:🚒]))
# ╔═╡ 7bf217f7-639b-46c7-8565-4cdbf82bba26
decode_colors(bind(toy_data_emb, emojis_hdvs[:💦]))
# ╔═╡ 4c557c67-aa46-416e-a31a-c791dc954a9d
decode_colors(bind(toy_data_emb, emojis_hdvs[:🌱]))
# ╔═╡ ac580e2a-c2bf-4243-8502-2263a4e8d1f4
decode_colors(bind(toy_data_emb, emojis_hdvs[:🍌]))
# ╔═╡ 30c0e722-9457-4609-943f-c7b837b45151
toy_data2 = [rand(emojis) |> l->(l, shuffle!([rand(emojis_colors[l]), randcol(), randcol()])) for i in 1:500]
# ╔═╡ 4c417b6d-97ad-4176-bb7f-4ccb14bdc6cf
toy_data2
# ╔═╡ eb1e08c1-72e6-46dd-936e-0f7b09dbaa29
toy_data_emb2 = bundle(encode_shape_col_pair2.(toy_data2)...)
# ╔═╡ 57593d22-5be6-484a-93b9-323ae0d982cc
decode_colors(bind(toy_data_emb2, emojis_hdvs[:🚒]))
# ╔═╡ 39ce810c-bcc8-4ed8-b349-eef5c11b1b1a
decode_colors(bind(toy_data_emb2, emojis_hdvs[:💦]))
# ╔═╡ 860deb07-0ddd-4b04-96c8-e67bd3f5e5b6
decode_colors(bind(toy_data_emb2, emojis_hdvs[:🌱]))
# ╔═╡ ecbcf732-9459-425f-aa51-be9d5a32667a
decode_colors(bind(toy_data_emb2, emojis_hdvs[:🍌]))
# ╔═╡ c1878a27-fb4e-4315-a922-1e3f45844b32
md"""
### Loading the recipes data
"""
# ╔═╡ f642e17c-ef79-4fd0-86df-5155bb44284c
recipes_data = CSV.read("data/Recipes_with_origin.csv", DataFrame)[:,2:end];
# ╔═╡ 4284e904-5541-4568-bc9f-0547f65741e5
recipes_data
# ╔═╡ d8405623-6eef-45d4-857d-5bf9793957e5
begin
    ingredients = names(recipes_data)[1:end-11]
    n_recipes = size(recipes_data, 1)
    ingr2ind = Dict(ingr=>i for (i,ingr) in enumerate(ingredients))
    origins = recipes_data[:,end-10:end]
    regions = names(origins)
    recipes = sparse(Matrix(recipes_data[:,1:end-11]))
end;
# ╔═╡ d597efc2-ce62-45bc-b0e5-9818730566e2
ingredients
# ╔═╡ 026a628a-f9fd-44ac-a2cb-814c2271a8c4
Xingr = hdv(length(ingredients))
# ╔═╡ 29b61795-178e-4258-b735-79b29abb8d72
Xrec = recipes * Xingr .|> sign  # one HDV per recipe: the sparse product sums its ingredients' HDVs, the sign bundles them
# ╔═╡ 1b94e0dc-3dc7-43a0-bb5c-58bcf53ccc00
recipes_hdv = sum(Xrec, dims=1)[:] .|> sign  # bundle all recipes into one 'master recipe' HDV
# ╔═╡ 371a69e4-7e1c-42c4-b604-eef6702c699b
Xorigin = Matrix(origins)' * Xrec .|> sign  # bundle the recipes per region of origin
# ╔═╡ 7fc3ebac-0e69-4f8b-b140-daa726a67f60
C_or = Xorigin * Xorigin';
# ╔═╡ e45bb41b-aa53-402d-99ac-332dd5be4287
DataFrame([regions C_or], ["origin", regions...])
# ╔═╡ 2bf5b6f7-4d2b-4610-aace-d8b0da82e670
let
    # simple PCA
    C̃ = C_or .- mean(C_or, dims=1) .- mean(C_or, dims=2) .+ mean(C_or)
    Λ, V = eigen(C̃)
    p = scatter(V[:,end], V[:,end-1], label="", color="black")
    annotate!(p, V[:,end], V[:,end-1], regions)
    p
end
# ╔═╡ 081e33de-c308-416c-b081-1a564f52089f
DataFrame(
Dict(r => [(ingr, cos(Xingr[i,:], Xorigin[j,:])) for (i, ingr) in enumerate(ingredients)] |> top10
for (j, r) in enumerate(regions)))
# ╔═╡ f233f712-832d-474d-8994-bdd0c478cab1
embed_ingr(ingr) = Xingr[ingr2ind[ingr], :]
# ╔═╡ 8cde2df6-8259-4884-b0fd-aac1178077a5
encode_recipe(recipe) = bundle(embed_ingr.(recipe)...)
# ╔═╡ 9ae32d1b-1e06-4ddf-9db2-f9fbb2cbbbbc
my_recipe_hdv = encode_recipe(["wine", "butter", "lemon peel", "chicken", "black pepper", "cheese"])
# ╔═╡ 4ae2c7ce-602b-4fce-9d39-5d4adfd1a584
cos(my_recipe_hdv, recipes_hdv)
# ╔═╡ 8317b9af-dcb3-45bf-8419-77d6d422a4eb
rand_ingr = rand(ingredients, 5)
# ╔═╡ b46651f5-2e8d-4d3c-bb84-8a85f1d788f0
fake_rec_hdv = encode_recipe(rand_ingr)
# ╔═╡ 63da4716-384a-470f-9758-cd3c46229813
cos(fake_rec_hdv, recipes_hdv) # lower!
# ╔═╡ e87efdc5-c0f2-40c8-b5d0-476888b91ef6
zip(regions, Xorigin * my_recipe_hdv) |> collect |> top10
# ╔═╡ cadfce71-ec41-440d-ac06-ce2e4fa59818
[(ingr, cos(recipes_hdv, Xingr[i,:] .* my_recipe_hdv)) for (i, ingr) in enumerate(ingredients)] |> top10
# ╔═╡ cfb7d15f-7731-4b7e-8868-e6d428af0251
md"""
### Loading the protein data
"""
# ╔═╡ 4bd3b204-c86f-4db1-818b-7cc3b239ac1c
amino_acids = mapreduce(unique, union, sequences) |> sort!
# ╔═╡ 3382d97d-7df0-4fd3-9243-104a61db8335
amino_acids
# ╔═╡ 167861f2-0ad7-444a-89d7-762e34f5fe00
const aa_hdvs = Dict(aa=>hdv() for aa in amino_acids)
# ╔═╡ 7b030f8e-9a10-438a-b702-9b29ee0250c3
const trimer_hdvs = Dict(aa1 * aa2 * aa3 =>
bind(aa_hdvs[aa1], shift(aa_hdvs[aa2]), shift(aa_hdvs[aa3], 2))
for aa1 in amino_acids for aa2 in amino_acids for aa3 in amino_acids)
# ╔═╡ 1fc29d2b-4694-4794-8191-3d4dfbbfcbf7
function embed_sequences(sequences)
    # preallocate an empty matrix
    hdvs = zeros(Int, length(sequences), N)
    for (i, seq) in enumerate(sequences)
        v = @view hdvs[i,:] # ref to hdv i
        for pos in 1:length(seq)-2
            trimer = seq[pos:pos+2]
            v .+= trimer_hdvs[trimer]
        end
        v .= sign.(v)
    end
    return hdvs
end
# ╔═╡ 40605a9d-5fcd-45e4-b953-72b618fc239b
Xseq = embed_sequences(sequences)
# ╔═╡ c7a54d09-d61f-4e6b-bf53-4732d3b58e09
let
    Cseq = Xseq * Xseq'
    # simple PCA
    C̃ = Cseq .- mean(Cseq, dims=1) .- mean(Cseq, dims=2) .+ mean(Cseq)
    Λ, V = eigen(C̃)
    p = scatter(V[humane,end], V[humane,end-1], label="human", color="blue", alpha=0.7)
    scatter!(V[.!humane,end], V[.!humane,end-1], label="yeast", color="orange", alpha=0.7)
    p
end
# ╔═╡ 43586fa7-2e82-4eac-a330-fa9acefb18dc
human_hdv = bundle(Xseq[train.&humane,:])
# ╔═╡ 7e4bf5ae-05cd-4e93-9b6f-daa0b71d89d7
yeast_hdv = bundle(Xseq[train.&.!humane,:])
# ╔═╡ 204bb552-9cb3-447c-820d-c0030db016c9
cos(human_hdv, yeast_hdv) # prototypes are related, as would be expected
# ╔═╡ 0833f9bd-0d3f-4379-9a84-8c8895b5f86c
predict_sp(x) = cos(human_hdv, x) > cos(yeast_hdv, x) ? "HUMAN" : "YEAST"
# ╔═╡ 540ff878-8b30-4f39-b8c6-8de47938c082
predictions = predict_sp.(eachrow(Xseq[test,:]))
# ╔═╡ 83542647-f2fb-4cd8-b7ac-fcbd18e99629
mean(species[test] .== predictions)
# ╔═╡ 563f0d4e-176b-4e0f-a3a5-f3e2b30ccd4f
hdv_diff = sum(Xseq[train.&humane,:], dims=1)[:] -
sum(Xseq[train.&.!humane,:], dims=1)[:] #.|> sign
# ╔═╡ b745a8c1-1b40-4e89-a2f2-6023c1224b4f
predict_sp2(x) = cos(hdv_diff, x) > 0 ? "HUMAN" : "YEAST"
# ╔═╡ ee157137-78c1-4cfe-a5b1-39408d05b900
predictions2 = predict_sp2.(eachrow(Xseq[test,:]))
# ╔═╡ 43218d25-082e-44e4-bd06-d1eaf7ae5dc4
mean(species[test] .== predictions2)
# ╔═╡ 312c04ef-9a88-4f2e-ac9d-0367e74c1b2c
P₊human = mean(>(0), Xseq[train.&humane,:], dims=1)[:]
# ╔═╡ b81e0880-90f1-4494-a537-a8a521937064
P₊yeast = mean(>(0), Xseq[train.&.!humane,:], dims=1)[:]
# ╔═╡ 271c915d-050c-410a-9a87-52762e62f9c9
θ = @. (log(P₊human) - log(1-P₊human)) - (log(P₊yeast) - log(1-P₊yeast))  # per-bit log-odds weights
# ╔═╡ 51f12893-09e7-4483-b558-b5d6450720cd
predict_sp3(x) = dot(θ, x) > 0 ? "HUMAN" : "YEAST"
# ╔═╡ 7fd5b9bd-3e7e-41b0-9a7d-a6bb6cc4220d
predictions3 = predict_sp3.(eachrow(Xseq[test,:]))
# ╔═╡ 2c80eacd-2265-40e3-a2a6-80977e15af66
mean(species[test] .== predictions3)
# ╔═╡ fc14fadf-8bea-4a33-9faf-7149edba5db9
[(trimer, cos(v, θ)) for (trimer, v) in trimer_hdvs] |> top10
# ╔═╡ 00000000-0000-0000-0000-000000000001
PLUTO_PROJECT_TOML_CONTENTS = """
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
[compat]
CSV = "~0.10.4"
Colors = "~0.12.8"
DataFrames = "~1.3.4"
FASTX = "~2.0.0"
Plots = "~1.32.0"
PlutoUI = "~0.7.40"
StatsBase = "~0.33.21"
"""
# ╔═╡ 00000000-0000-0000-0000-000000000002
PLUTO_MANIFEST_TOML_CONTENTS = """
# This file is machine-generated - editing it directly is not advised
[[AbstractPlutoDingetjes]]
deps = ["Pkg"]
git-tree-sha1 = "8eaf9f1b4921132a4cff3f36a1d9ba923b14a481"
uuid = "6e696c72-6542-2067-7265-42206c756150"
version = "1.1.4"
[[Adapt]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "195c5505521008abea5aee4f96930717958eac6f"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.4.0"
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Automa]]
deps = ["Printf", "ScanByte", "TranscodingStreams"]
git-tree-sha1 = "d50976f217489ce799e366d9561d56a98a30d7fe"
uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
version = "0.8.2"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BioGenerics]]
deps = ["TranscodingStreams"]
git-tree-sha1 = "0b581906418b93231d391b5dd78831fdc2da0c82"
uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea"
version = "0.1.2"
[[BioSequences]]
deps = ["BioSymbols", "Random", "Twiddle"]
git-tree-sha1 = "523d40090604deae32078e3bf3d8570ab1cb585b"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
version = "3.1.0"
[[BioSymbols]]
git-tree-sha1 = "6f59deb6e86841a75188721c567fad81fbc305f1"
uuid = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
version = "5.1.1"
[[BitFlags]]
git-tree-sha1 = "84259bb6172806304b9101094a7cc4bc6f56dbc6"
uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35"
version = "0.1.5"
[[Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+0"
[[CSV]]
deps = ["CodecZlib", "Dates", "FilePathsBase", "InlineStrings", "Mmap", "Parsers", "PooledArrays", "SentinelArrays", "Tables", "Unicode", "WeakRefStrings"]
git-tree-sha1 = "873fb188a4b9d76549b81465b1f75c82aaf59238"
uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
version = "0.10.4"
[[Cairo_jll]]
deps = ["Artifacts", "Bzip2_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"]
git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2"
uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a"
version = "1.16.1+1"
[[ChainRulesCore]]
deps = ["Compat", "LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "dc4405cee4b2fe9e1108caec2d760b7ea758eca2"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.15.5"
[[ChangesOfVariables]]
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
git-tree-sha1 = "38f7a08f19d8810338d4f5085211c7dfa5d5bdd8"
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
version = "0.1.4"
[[CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"]
git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.7.0"
[[ColorSchemes]]
deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "Random"]
git-tree-sha1 = "1fd869cc3875b57347f7027521f561cf46d1fcd8"
uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
version = "3.19.0"
[[ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"
[[ColorVectorSpace]]
deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "SpecialFunctions", "Statistics", "TensorCore"]
git-tree-sha1 = "d08c20eef1f2cbc6e60fd3612ac4340b89fea322"
uuid = "c3611d14-8923-5661-9e6a-0046d554d3a4"
version = "0.9.9"
[[Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.8"
[[Compat]]
deps = ["Dates", "LinearAlgebra", "UUIDs"]
git-tree-sha1 = "5856d3031cdb1f3b2b6340dfdc66b6d9a149a374"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.2.0"
[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
[[Contour]]
git-tree-sha1 = "d05d9e7b7aedff4e5b51a029dced05cfb6125781"
uuid = "d38c429a-6771-53c6-b99e-75d170b6e991"
version = "0.6.2"
[[Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"
[[DataAPI]]
git-tree-sha1 = "1106fa7e1256b402a86a8e7b15c00c85036fef49"