-
Notifications
You must be signed in to change notification settings - Fork 37
/
lecture_13-content.js
221 lines (221 loc) · 55 KB
/
lecture_13-content.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// Lecture intro: recap of the previous two data lectures.
// Each context entry records the Python source location that generated the slide line.
const ctxLecture = (lineno) => [{ name: "lecture_13", filename: "lecture_13.py", lineno }];

addText(ctxLecture(9), "Lecture 11: overview of different services (e.g., GitHub), datasets (C4), processing methods (CCNet)", {});
addText(ctxLecture(10), "Lecture 12: mechanics of learned data filtering (KenLM, fastText, DSIR)", {});
addText(ctxLecture(12), "This lecture:", {});
// "Deduplication" section overview: motivation, examples of exact/near duplicates,
// and the design space for deduplication pipelines.
// Fix: the [Lee+ 2022] citation line previously read "Deduplication training data ...";
// the paper (arXiv:2107.06499) is titled "Deduplicating Training Data Makes Language
// Models Better", matching the phrasing used later in this file.
const ctxDedup = (lineno) => [
  { name: "lecture_13", filename: "lecture_13.py", lineno: 14 },
  { name: "deduplication", filename: "lecture_13.py", lineno },
];

addText(ctxDedup(29), "Deduplication: given a training corpus", {});
addText(ctxDedup(31), "Two types of duplicates", {});
addText(ctxDedup(32), "- Exact duplicates (mirror sites, GitHub forks)", {});
addText(ctxDedup(32), "https://www.gutenberg.org/MIRRORS.ALL", {"color": "gray"});
addText(ctxDedup(33), "- Near duplicates: same text differing by a few tokens", {});
addText(ctxDedup(35), "Examples of near duplicates", {});
addText(ctxDedup(36), "- Terms of service and licenses", {});
addText(ctxDedup(36), "https://opensource.org/license/mit", {"color": "gray"});
addText(ctxDedup(37), "- Formulaic writing (copy/paste or generated from template)", {});
addImage(ctxDedup(37), "https://d3i71xaburhd42.cloudfront.net/4566c0d22ebf3c31180066ab23b6c445aeec78d5/5-Table1-1.png", {"width": "100.0%"});
addText(ctxDedup(38), "- Minor formatting differences in copy/pasting", {});
addText(ctxDedup(40), "Product description repeated 61,036 times in C4", {});
addText(ctxDedup(41), "'\u201cby combining fantastic ideas, interesting arrangements, and follow the current trends in the field of that make you more inspired and give artistic touches. We\u2019d be honored if you can apply some or all of these design in your wedding. believe me, brilliant ideas would be perfect if it can be applied in real and make the people around you amazed!", {});
addText(ctxDedup(46), "https://www.amazon.co.uk/suryagede-100-Graffiti-Gas-Mask/dp/B07CRHT3RG", {"color": "gray"});
addText(ctxDedup(47), "https://apkpure.com/100-graffiti-gas-mask/com.GraffitiGasMask.suryagede", {"color": "gray"});
addText(ctxDedup(49), "Deduplicating training data makes language models better [Lee+ 2022]", {});
addText(ctxDedup(49), "https://arxiv.org/pdf/2107.06499", {"color": "gray"});
addText(ctxDedup(50), "- Train more efficiently (because have fewer tokens)", {});
addText(ctxDedup(51), "- Avoid memorization (can mitigate copyright, privacy concerns)", {});
addText(ctxDedup(53), "Design space", {});
addText(ctxDedup(54), "1. What is an item (sentence, paragraph, document)?", {});
addText(ctxDedup(55), "2. How to match (exact match, existence of common subitem, fraction of common subitems)?", {});
addText(ctxDedup(56), "3. What action to take (remove all, remove all but one)?", {});
addText(ctxDedup(58), "Key challenge:", {});
addText(ctxDedup(59), "- Deduplication is fundamentally about comparing items to other items", {});
addText(ctxDedup(60), "- Need linear time algorithms to scale", {});
// "Hash functions" subsection: basics needed for the deduplication algorithms below.
const ctxHash = (lineno) => [
  { name: "lecture_13", filename: "lecture_13.py", lineno: 14 },
  { name: "deduplication", filename: "lecture_13.py", lineno: 62 },
  { name: "hash_functions", filename: "lecture_13.py", lineno },
];

addText(ctxHash(73), "Hash function h maps item to a hash value (integer or string)", {});
addText(ctxHash(74), "Hash value much smaller than item", {});
addText(ctxHash(75), "Hash collision: h(x) = h(y) for x \u2260 y", {});
addText(ctxHash(77), "Tradeoff between efficiency and collision resistance", {});
addText(ctxHash(78), "- Cryptographic hash functions (SHA-256): collision resistant, slow (used in bitcoin)", {});
addText(ctxHash(79), "- DJB2, MurmurHash, CityHash: not collision resistant, fast (used for hash tables)", {});
addText(ctxHash(84), "https://softwareengineering.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed", {"color": "gray"});
// "Exact deduplication" subsection: C4's approach and a simple string example.
const ctxExact = (lineno) => [
  { name: "lecture_13", filename: "lecture_13.py", lineno: 14 },
  { name: "deduplication", filename: "lecture_13.py", lineno: 64 },
  { name: "exact_deduplication", filename: "lecture_13.py", lineno },
];

addText(ctxExact(88), "## C4", {});
addText(ctxExact(88), "https://arxiv.org/pdf/1910.10683v4", {"color": "gray"});
addText(ctxExact(89), "1. Item: 3-sentence spans", {});
addText(ctxExact(90), "2. Exact match", {});
addText(ctxExact(91), "3. Remove all but one", {});
addText(ctxExact(93), "Warning: when a 3-sentence span is removed from the middle of a document, the resulting document might lose coherence", {});
addText(ctxExact(96), "## Simple example", {});
addText(ctxExact(97), "1. Item: string", {});
addText(ctxExact(98), "2. Exact match", {});
addText(ctxExact(99), "3. Remove all but one", {});
addText(ctxExact(109), ["hi", "bye", "hello", "hello there", "Hello!"], {});
addText(ctxExact(111), "Pro: simple, clear semantics, high precision", {});
addText(ctxExact(112), "Con: does not deduplicate near duplicates", {});
addText(ctxExact(114), "This code is written in a MapReduce way, can easily parallelize", {});
// "Bloom filter" subsection: approximate set membership for exact deduplication at scale.
const ctxBloom = (lineno) => [
  { name: "lecture_13", filename: "lecture_13.py", lineno: 14 },
  { name: "deduplication", filename: "lecture_13.py", lineno: 65 },
  { name: "bloom_filter", filename: "lecture_13.py", lineno },
];

addText(ctxBloom(118), "Goal: efficient, approximate data structure for testing set membership", {});
addText(ctxBloom(120), "Features of Bloom filters", {});
addText(ctxBloom(121), "- Memory efficient", {});
addText(ctxBloom(122), "- Can update, but can't delete", {});
addText(ctxBloom(123), "- If return 'no', definitely 'no'", {});
addText(ctxBloom(124), "- If return 'yes', most likely 'yes', but small probability of 'no'", {});
addText(ctxBloom(125), "- Can drive the false positive rate down exponentially with more time/compute", {});
addText(ctxBloom(129), "First, make the range of hash function small.", {});
addText(ctxBloom(136), "Problem: false positives for small bins", {});
addText(ctxBloom(138), "Naive solution: increase the number of bins", {});
addText(ctxBloom(139), "Error probability is O(1/num_bins), decreases polynomially with memory", {});
addText(ctxBloom(141), "Better solution: use more hash functions", {});
addText(ctxBloom(147), "## False positive rate", {});
addText(ctxBloom(149), "Assume independence of hash functions and bits", {});
addText(ctxBloom(149), "https://en.wikipedia.org/wiki/Bloom_filter", {"color": "gray"});
addText(ctxBloom(162), "Optimal value of k (given fixed m / n ratio)", {});
addText(ctxBloom(165), "Resulting false positive rate", {});
addText(ctxBloom(168), "Tradeoff between compute (k), memory (m), and false positive rate (f)", {});
addText(ctxBloom(169), "https://people.eecs.berkeley.edu/~daw/teaching/cs170-s03/Notes/lecture10.pdf", {"color": "gray"});
addText(ctxBloom(171), "Example: Dolma", {});
addText(ctxBloom(172), "- Set false positive rate to 1e-15", {});
addText(ctxBloom(173), "- Perform on paragraphs", {});
// "Suffix arrays" subsection: n-gram-based near-duplicate removal [Lee+ 2022].
const ctxSuffix = (lineno) => [
  { name: "lecture_13", filename: "lecture_13.py", lineno: 14 },
  { name: "deduplication", filename: "lecture_13.py", lineno: 67 },
  { name: "suffix_arrays", filename: "lecture_13.py", lineno },
];
// Fresh options object per call, in case the renderer mutates it.
const mono = () => ({ "font-family": "monospace", "white-space": "pre" });

addText(ctxSuffix(212), "Definition: two items are near duplicates if they share an n-gram [Lee+ 2022]", {});
addText(ctxSuffix(213), "https://arxiv.org/pdf/2107.06499", {"color": "gray"});
addText(ctxSuffix(215), "Example of two phrases that share a 3-gram", {});
addText(ctxSuffix(216), "- the cat in the hat", {});
addText(ctxSuffix(217), "- the dog in the hat", {});
addText(ctxSuffix(219), "Deduplicating training data makes language models better [Lee+ 2022]", {});
addText(ctxSuffix(220), "1. Item: document", {});
addText(ctxSuffix(221), "2. Share an n-gram (for n = 50 using BPE tokenization)", {});
addText(ctxSuffix(222), "3. Remove all but one n-gram (but keep the rest of the document)", {});
addText(ctxSuffix(224), "Naive solution: map each n-gram to list of documents containing it", {});
addText(ctxSuffix(226), "Slicker solution: suffix arrays", {});
addText(ctxSuffix(228), "Suffix array is a data structure that stores all suffixes of a string S", {});
addText(ctxSuffix(229), "- O(|S|) time to build", {});
addText(ctxSuffix(230), "- Only 8 bytes of memory per element of S", {});
addText(ctxSuffix(238), "Suffix array", {});
addText(ctxSuffix(240), "<|endoftext|> the dog in the hat", mono());
addText(ctxSuffix(240), "cat in the hat <|endoftext|> the dog in the hat", mono());
addText(ctxSuffix(240), "dog in the hat", mono());
addText(ctxSuffix(240), "hat", mono());
addText(ctxSuffix(240), "hat <|endoftext|> the dog in the hat", mono());
addText(ctxSuffix(240), "in the hat", mono());
addText(ctxSuffix(240), "in the hat <|endoftext|> the dog in the hat", mono());
addText(ctxSuffix(240), "the cat in the hat <|endoftext|> the dog in the hat", mono());
addText(ctxSuffix(240), "the dog in the hat", mono());
addText(ctxSuffix(240), "the hat", mono());
addText(ctxSuffix(240), "the hat <|endoftext|> the dog in the hat", mono());
// NOTE(review): the line below appears truncated in the source material
// ("the longest n" — presumably "longest common prefix"); reproduced verbatim.
addText(ctxSuffix(242), "To find documents with shared n-grams, simply look at adjacent documents and compute the longest n", {});
// "Jaccard similarity / MinHash" subsection: similarity-based near-duplicate detection.
// Fix: the displayed slide text "then hash doesn't matches" had a grammar error;
// corrected to "then hash doesn't match" (parallel to "then hash matches" above it).
const ctxJaccard = (lineno) => [
  { name: "lecture_13", filename: "lecture_13.py", lineno: 14 },
  { name: "deduplication", filename: "lecture_13.py", lineno: 68 },
  { name: "jaccard_minhash", filename: "lecture_13.py", lineno },
];
// Fresh options object per call, in case the renderer mutates it.
const monoJaccard = () => ({ "font-family": "monospace", "white-space": "pre" });

addText(ctxJaccard(247), "## Jaccard similarity", {});
addText(ctxJaccard(249), "Jaccard similarity: size of intersection divided by size of union", {});
addText(ctxJaccard(259), "Definition: two documents are near duplicates if their Jaccard similarity is above a certain threshold", {});
addText(ctxJaccard(262), "Algorithmic challenge: find near duplicates in linear time", {});
addText(ctxJaccard(264), "## MinHash", {});
addText(ctxJaccard(266), "MinHash: a random hash function h so that Pr[h(A) = h(B)] = Jaccard(A, B)", {});
addText(ctxJaccard(269), "Normally, you want different items to hash to different hashes, but here, you want collision probability to depend on similarity", {});
addText(ctxJaccard(275), "Characteristic matrix representation:", {});
addText(ctxJaccard(276), "  | A | B", monoJaccard());
addText(ctxJaccard(277), "1 | 1 | 0", monoJaccard());
addText(ctxJaccard(278), "2 | 1 | 0", monoJaccard());
addText(ctxJaccard(279), "3 | 1 | 0", monoJaccard());
addText(ctxJaccard(280), "4 | 1 | 0", monoJaccard());
addText(ctxJaccard(281), "5 | 0 | 1", monoJaccard());
addText(ctxJaccard(283), "Random hash function induces a permutation over items", {});
addText(ctxJaccard(284), "If 1, 2, 3 is first (min), then hash matches", {});
addText(ctxJaccard(285), "If 4, 5 is first (min), then hash doesn't match", {});
addText(ctxJaccard(293), "We have reduced the footprint of an item from set size to n", {});
addText(ctxJaccard(294), "However, recall our goal was to find (A, B) with Jaccard(A, B) > threshold.", {});
addText(ctxJaccard(295), "Do we still have to iterate over all pairs?", {});
// Locality sensitive hashing (LSH): banding MinHash signatures (b bands of r
// hash functions each, n = b * r) to make similar pairs collide with sharp threshold.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 299}], "Locality sensitive hashing (LSH)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 301}], "Goal: hash similar items together", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 302}], "More precisely: have A and B collide if Jaccard(A, B) > threshold", {})
// Fixed displayed text: inserted missing "with" ("hash examples just one" -> "hash examples with just one").
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 304}], "Suppose we hash examples with just one MinHash function", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 305}], "P[A and B collide] = Jaccard(A, B)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 306}], "On average, more similar items will collide, but very stochastic...", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 308}], "Solution: use n hash functions", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 309}], "Break up into b bands of r hash functions each (n = b * r)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 314}], "Hash functions: h1 h2 h3 h4 | h5 h6 h7 h8 | h9 h10 h11 h12", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 316}], "Key: A and B collide if for *some* band, *all* its hash functions return same value", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 317}], "As we will see, the and-or structure of the bands sharpens the threshold", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 319}], "Given Jaccard(A, B), what is the probability that A and B collide?", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 326}], "An example", {})
addImage([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 328}], "https://cdn.sanity.io/images/vr8gru94/production/b470799575b8e77911bacb8500977afef06d6c85-1280x720.png", {"width": "100.0%"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 330}], "Increasing r sharpens the threshold, moves the curve to the right (harder to match)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 332}], "---", {})
// Collision probabilities at various similarities for two (b, r) settings.
// Fixed displayed-text typo throughout: "collison" -> "collision" (numeric values unchanged).
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.7: P(collision) = 0.015838061136010495", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.75: P(collision) = 0.061549358914366414", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.8: P(collision) = 0.2069934794290924", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.85: P(collision) = 0.546433539626784", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.9: P(collision) = 0.9251697173501179", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.95: P(collision) = 0.9998606445935663", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 335}], "sim=0.98: P(collision) = 0.9999999997289734", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 337}], "---", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.7: P(collision) = 0.00045069029471334066", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.75: P(collision) = 0.0035655888941164005", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.8: P(collision) = 0.024469778036042777", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.85: P(collision) = 0.14204227824907933", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.9: P(collision) = 0.5795029568575274", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.95: P(collision) = 0.9920312678332026", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 340}], "sim=0.98: P(collision) = 0.9999998584279841", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 342}], "Increasing b moves the curve to the left (easier to match)", {})
addImage([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 343}], "https://cdn.sanity.io/images/vr8gru94/production/aace49fa240778e8ecf6e85ad08a2de7f5385566-1280x720.png", {"width": "100.0%"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 345}], "What is the threshold?", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 352}], "Example setting [Lee+ 2022]: n = 9000, b = 20, r = 450", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 354}], "References", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 14}, {"name": "deduplication", "filename": "lecture_13.py", "lineno": 69}, {"name": "locality_sensitive_hashing", "filename": "lecture_13.py", "lineno": 354}], "http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf", {"color": "gray"})
// Section: copyright-law background for training-data legality.
// NOTE(review): generated lecture-trace content. The first argument is the
// slide hierarchy/stack; the string payload is the rendered lecture text.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 358}], "Lots of lawsuits around generative AI, mostly around copyright", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 359}], "https://www.bakerlaw.com/services/artificial-intelligence-ai/case-tracker-artificial-intelligence-copyrights-and-class-actions/", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 361}], "## Intellectual property law", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 363}], "Goal: *incentivize* the creation of intellectual goods", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 364}], "Types of intellectual property: copyright, patents, trademarks, trade secrets.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 366}], "## Copyright law", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 368}], "Goes back to 1709 in England (Statute of Anne), first time regulated by governments and courts", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 369}], "https://en.wikipedia.org/wiki/Statute_of_Anne", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 371}], "In United States, most recent: Copyright Act of 1976", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 371}], "https://en.wikipedia.org/wiki/Copyright_Act_of_1976", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 373}], "Copyright protection applies to 'original works of authorship fixed in any tangible medium of expression, now known or later developed, from which they can be perceived, reproduced, or otherwise communicated, either directly or with the aid of a machine or device'", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 378}], "Original works, so collections not copyrightable (e.g., telephone directories) unless there is some creativity in the selection or arrangement", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 380}], "Copyright applies to expression, not ideas (e.g., quicksort)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 382}], "Expanded scope from 'published' (1909) to 'fixed'", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 383}], "Registration not required for copyright protection (in contrast with patents)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 384}], "Threshold for copyright is extremely low (e.g., your website)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 386}], "Registration is required before creator can sue someone for copyright infringement", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 387}], "Costs $65 to register", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 387}], "https://www.copyright.gov/about/fees.html", {"color": "gray"})
// NOTE(review): the current U.S. copyright term is generally life of the author
// + 70 years; the "75 years" figure below looks approximate — confirm against
// the lecture source before changing the displayed text.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 389}], "Lasts for 75 years, and then the copyright expires and it becomes part of the public domain (works of Shakespeare, Beethoven, most of Project Gutenberg, etc.)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 392}], "Summary: most things on the Internet are actually copyrighted.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 394}], "How to use a copyrighted work:", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 395}], "1. Get a license for it.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 396}], "2. Appeal to the fair use clause.", {})
// Section: licenses (Creative Commons; data-licensing deals for model training).
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 398}], "## Licenses", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 400}], "A license (from contract law) is granted by a licensor to a licensee.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 401}], "Effectively, 'a license is a promise not to sue'.", {})
// Fixed displayed-text grammar: "license, enable" -> "license enables".
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 403}], "The Creative Commons license enables free distribution of copyrighted work.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 404}], "Examples: Wikipedia, Open Courseware, Khan Academy, Free Music Archive, 307 million images from Flickr, 39 million images from MusicBrainz, 10 million videos from YouTube, etc.", {})
// Fixed factual typo: Creative Commons was founded by Lessig and Eldred in 2001, not 2021.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 406}], "Created by Lessig and Eldred in 2001 to bridge public domain and existing copyright", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 408}], "Many model developers license data for training foundation models", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 409}], "- Google and Reddit", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 409}], "https://www.reuters.com/technology/reddit-ai-content-licensing-deal-with-google-sources-say-2024-02-22/", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 410}], "- OpenAI and Shutterstock", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 410}], "https://investor.shutterstock.com/news-releases/news-release-details/shutterstock-expands-partnership-openai-signs-new-six-year", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 411}], "- OpenAI and StackExchange", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 411}], "https://stackoverflow.co/company/press/archive/openai-partnership", {"color": "gray"})
// Section: fair use (17 U.S.C. § 107 four-factor test) and its application to
// foundation-model training.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 413}], "## Fair use (section 107)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 415}], "Four factors to determine whether fair use applies:", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 417}], "1. The purpose and character of the use (educational favored over commercial, transformative favored over reproductive)", {})
// Fixed factor 2: under § 107 doctrine, use of factual/non-creative works is
// more likely fair than use of fictional/creative works (original text had the
// direction inverted, plus the typo "creativitive").
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 420}], "2. The nature of the copyrighted work (factual favored over fictional, non-creative over creative)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 423}], "3. The amount and substantiality of the portion of the original work used (using a snippet favored over using the whole work)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 426}], "4. The effect of the use upon the market (or potential market) for the original work", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 428}], "Examples of fair use:", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 429}], "- You watch a movie and write a summary of it", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 430}], "- Reimplement an algorithm (the idea) rather than copying the code (the expression)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 431}], "- Google Books index and show snippets (Authors Guild v. Google 2002-2013)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 433}], "Copyright is not about verbatim memorization", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 434}], "- Plots and characters (e.g., Mickey Mouse) can be copyrightable", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 435}], "- Parody is likely fair use", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 436}], "Copyright is about semantics (and economics)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 438}], "Considerations for foundation models", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 440}], "Copying data (first step of training) is violation already even if you don't do anything with it.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 441}], "Training an ML model is transformative (far from just copy/pasting)", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 442}], "ML system is interested in idea (e.g., stop sign), not in the concrete expression (e.g., exact artistic choices of a particular image of a stop sign).", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 444}], "Problem: language models can definitely affect the market (writers, artists)", {})
// Section: terms of service (restrictions beyond copyright) and lecture references.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 446}], "## Terms of service", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 448}], "Even if you have a license or can appeal to fair use, terms of service might impose additional restrictions.", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 451}], "Example: YouTube's terms of service prohibits downloading videos, even if the videos are licensed under Creative Commons.", {})
// Reference links (rendered in gray) follow their title lines.
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 454}], "Course notes", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 454}], "https://stanford-cs324.github.io/winter2022/lectures/legality/", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 455}], "Fair learning [Lemley & Casey]", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 455}], "https://texaslawreview.org/fair-learning/", {"color": "gray"})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 456}], "Foundation models and fair use [Henderson+ 2023]", {})
addText([{"name": "lecture_13", "filename": "lecture_13.py", "lineno": 15}, {"name": "copyright", "filename": "lecture_13.py", "lineno": 456}], "https://arxiv.org/pdf/2303.15715", {"color": "gray"})