-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhuffman.py
399 lines (348 loc) · 14.4 KB
/
huffman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
import array_list
import linked_list
import sys
from huffman_bits_io import *
sys.setrecursionlimit(10000)
# string -> boolean
# takes a file name, returns true if the file exists and can be read, false otherwise
def check_file(file_name):
try:
file_obj = open(file_name, 'r')
file_obj.close()
return True
except:
return False
# -> ArrayList
# Returns a blank ArrayList of size 256
def gen_256_list():
my_array = array_list.List([0]*256, 256, 256)
return my_array
# string -> list
# takes in a file name and counts the occurrence of each character in a text file, returns the frequency of each
# character in the list
def count_occurrence(file_name):
f = open(file_name, 'r')
file_string = f.read()
my_array = gen_256_list()
for elm in file_string:
location_val = array_list.get(my_array, ord(elm)) + 1
array_list.set(my_array, ord(elm), location_val)
f.close()
return my_array
# A HuffmanTree is either:
# - Leaf(ascii_rep, freq)
# - Node(ascii_rep, freq, left, right)
# A Leaf is an
# ascii_rep representing a value of an ascii character
# freq representing the frequency that character occurs
class Leaf:
def __init__(self, ascii_rep, freq):
self.ascii_rep = ascii_rep # int
self.freq = freq # int
def __eq__(self, other):
return (type(other) == Leaf and
self.ascii_rep == other.ascii_rep and
self.freq == other.freq)
def __repr__(self):
return '(Leaf: {}, {})'.format(chr(self.ascii_rep), self.freq)
# A Node is either
# ascii_rep representing a value of an ascii character
# freq representing the frequency that character occurs
# a left subtree and a right subtree
class Node:
def __init__(self, ascii_rep, freq, left, right):
self.ascii_rep = ascii_rep # int or None
self.freq = freq # int
self.left = left # Huffman Tree
self.right = right # Huffman Tree
def __eq__(self, other):
return (type(other) == Node and
self.ascii_rep == other.ascii_rep and
self.freq == other.freq and
self.right == other.right and
self.left == other.left)
def __repr__(self):
return '((Node: {}, {}) Left: {}, Right: {})'.format(chr(self.ascii_rep), self.freq, self.left, self.right)
# a HuffCode is an ascii representation of a character, and it's huffman code
class HuffCode:
def __init__(self, ascii_rep, code):
self.ascii_rep = ascii_rep # an int
self.code = code # a string
def __eq__(self, other):
return type(other) == HuffCode and self.ascii_rep == other.ascii_rep and self.code == other.code
def __repr__(self):
return '(Code: {}, {})'.format(chr(self.ascii_rep), self.code)
# HuffmanTree -> string
# Traverses the tree and generates a list of characters in pre-order
def traverse_tree_char(tree, my_string = ''):
if tree is not None:
if type(tree) == Leaf:
my_string += chr(tree.ascii_rep)
if type(tree) != Leaf:
my_string += traverse_tree_char(tree.left)
my_string += traverse_tree_char(tree.right)
return my_string
# HuffmanTree HuffmanTree -> boolean
# Returns true if the first HuffmanTree comes before the second one
def comes_before(a, b):
if a.freq < b.freq:
return True
if b.freq < a.freq:
return False
return a.ascii_rep < b.ascii_rep
# ArrayList -> LinkedList
# Returns a LinkedList in sorted order of every character that appears in the ArrayList
def list_to_leafs(lst):
if lst is None:
return None
leaf_list = linked_list.empty_list()
for i in range(256):
if lst.a_list[i] != 0:
leaf_list = linked_list.insert_sorted(leaf_list, Leaf(i, lst.a_list[i]), comes_before)
return leaf_list
# ArrayList -> HuffmanTree
# Takes in an ArrayList representing every character in the file, returns a HuffmanTree
def build_tree(lst):
sorted_list = list_to_leafs(lst)
if sorted_list is None:
return None
if sorted_list.rest is None:
leaf0, sorted_list = linked_list.remove(sorted_list, 0)
return leaf0
while (sorted_list.rest != None):
leaf0, sorted_list = linked_list.remove(sorted_list, 0)
leaf1, sorted_list = linked_list.remove(sorted_list, 0)
if leaf0.ascii_rep < leaf1.ascii_rep:
node = Node(leaf0.ascii_rep, leaf0.freq + leaf1.freq, leaf0, leaf1)
if leaf1.ascii_rep < leaf0.ascii_rep:
node = Node(leaf1.ascii_rep, leaf0.freq + leaf1.freq, leaf0, leaf1)
sorted_list = linked_list.insert_sorted(sorted_list, node, comes_before)
return sorted_list.first
# HuffmanTree ArrayList string -> ArrayList
# Takes in a HuffmanTree and returns an ArrayList containing the ascii value and huffman code of each element
def gen_code(huff_tree, lst, code = ''):
if huff_tree is None:
return None
if type(huff_tree) == Leaf:
lst = array_list.set(lst, huff_tree.ascii_rep, HuffCode(huff_tree.ascii_rep, code))
if type(huff_tree) != Leaf:
lst = gen_code(huff_tree.left, lst, code + '0')
lst = gen_code(huff_tree.right, lst, code + '1')
return lst
# string Array -> string
# Takes in an input file and the code values for the file, and returns the encoded string
def generate_string(file_name, code_array):
f = open(file_name, 'r')
unencoded_string = f.read()
f.close()
encoded_string = ''
for elm in unencoded_string:
elm_val = array_list.get(code_array, ord(elm))
encoded_string += elm_val.code
return encoded_string
# ArrayList -> int
# takes the list of codes and returns the number of codes in the list
def count_codes(lst):
count = 0
if lst is None:
return 0
if lst == gen_256_list():
return 0
for elm in lst.a_list:
if type(elm) == HuffCode:
count += 1
return count
# string string -> None
# Takes the names of two files and writes the huffman encoded version of the input file to the output w/ a decode header
def huffman_encode(input_file, output_file):
if not check_file(input_file):
print('Invalid Encode File')
return None
occurrence_array = count_occurrence(input_file)
hb_writer = HuffmanBitsWriter(output_file)
huff_tree = build_tree(occurrence_array)
huff_codes = gen_code(huff_tree, gen_256_list())
num_codes = count_codes(huff_codes)
encoded_string = generate_string(input_file, huff_codes)
hb_writer.write_byte(num_codes)
for i in range(256):
if occurrence_array.a_list[i] != 0:
hb_writer.write_byte(i)
hb_writer.write_int(occurrence_array.a_list[i])
hb_writer.write_code(encoded_string)
hb_writer.close()
return traverse_tree_char(huff_tree)
# fileObj HuffmanTree -> string
# reads bits from the file and traverses the tree to get to a leaf, returns leaf value
def traverse(huff_tree, file_obj):
if type(huff_tree) == Leaf:
return chr(huff_tree.ascii_rep)
bit = file_obj.read_bit()
if bit is False:
return traverse(huff_tree.left, file_obj)
if bit is True:
return traverse(huff_tree.right, file_obj)
# string string -> None
# Takes a compressed file and an output file, decompresses the file and writes it to the output file
def huffman_decode(input_file, output_file):
if not check_file(input_file):
print('Invalid Decode File')
return None
hb_reader = HuffmanBitsReader(input_file)
num_codes = hb_reader.read_byte()
occurrence_array = gen_256_list()
for i in range(num_codes):
ascii_val = hb_reader.read_byte()
freq = hb_reader.read_int()
occurrence_array = array_list.set(occurrence_array, ascii_val, freq)
huff_tree = build_tree(occurrence_array)
f = open(output_file, 'wb')
if huff_tree is None:
hb_reader.close()
f.close()
return None
for i in range(huff_tree.freq):
val = traverse(huff_tree, hb_reader)
f.write(val.encode())
f.close()
hb_reader.close()
return None
class HuffmanTest(unittest.TestCase):
def test_file_exists1(self):
file_name = 'test.txt'
self.assertTrue(check_file(file_name))
def test_file_exists2(self):
file_name = 'fake.txt'
self.assertFalse(check_file(file_name))
def test_count1(self):
file_name = 'test.txt'
my_array = count_occurrence(file_name)
self.assertEqual(array_list.get(my_array, ord('\n')), 0)
self.assertEqual(array_list.get(my_array, ord('a')), 4)
self.assertEqual(array_list.get(my_array, ord('b')), 3)
self.assertEqual(array_list.get(my_array, ord(' ')), 3)
def test_leaf1(self):
my_leaf = Leaf(97, 3)
self.assertEqual(repr(my_leaf), '(Leaf: a, 3)')
def test_Node(self):
my_leaf = Leaf(97, 3)
my_leaf1= Leaf(98, 1)
my_node = Node(97, 4, my_leaf, my_leaf1)
self.assertEqual(repr(my_node), '((Node: a, 4) Left: (Leaf: a, 3), Right: (Leaf: b, 1))' )
def test_travers1(self):
node = None
my_string = traverse_tree_char(node)
self.assertEqual(my_string, '')
def test_travers2(self):
my_leaf = Leaf(97, 3)
my_leaf1 = Leaf(98, 1)
my_node = Node(97, 2, my_leaf, my_leaf1)
my_string = traverse_tree_char(my_node)
self.assertEqual(my_string, 'ab')
def test_comes_before1(self):
my_leaf = Leaf(97, 3)
my_leaf1 = Leaf(98, 1)
self.assertTrue(comes_before(my_leaf1, my_leaf))
self.assertFalse(comes_before(my_leaf, my_leaf1))
my_leaf = Leaf(97, 3)
my_leaf1 = Leaf(98, 3)
self.assertTrue(comes_before(my_leaf, my_leaf1))
def test_insert_sorted1(self):
my_list = linked_list.Pair(Leaf(99, 1),
linked_list.Pair(Leaf(97, 2),
linked_list.Pair(Leaf(101, 5),
linked_list.Pair(Leaf(102, 5), None))))
my_list = linked_list.insert_sorted(my_list, Leaf(104, 3), comes_before)
check_list = linked_list.Pair(Leaf(99, 1),
linked_list.Pair(Leaf(97, 2),
linked_list.Pair(Leaf(104, 3),
linked_list.Pair(Leaf(101, 5),
linked_list.Pair(Leaf(102, 5), None)))))
self.assertEqual(my_list, check_list)
my_list = linked_list.insert_sorted(my_list, Leaf(105, 5), comes_before)
my_list = linked_list.insert_sorted(my_list, Leaf(99, 5), comes_before)
check_list = linked_list.Pair(Leaf(99, 1),
linked_list.Pair(Leaf(97, 2),
linked_list.Pair(Leaf(104, 3),
linked_list.Pair(Leaf(99, 5),
linked_list.Pair(Leaf(101, 5),
linked_list.Pair(Leaf(102, 5),
linked_list.Pair(Leaf(105, 5), None)))))))
self.assertEqual(my_list, check_list)
def test_build_tree1(self):
file_name = 'test.txt'
my_array = count_occurrence(file_name)
my_tree = build_tree(my_array)
test_tree = Node(32, 13,
Node(32, 6,
Leaf(32, 3),
Leaf(98, 3)),
Node(97, 7,
Node(99, 3,
Leaf(100, 1),
Leaf(99, 2)),
Leaf(97, 4)))
self.assertEqual(my_tree, test_tree)
def test_build_tree2(self):
my_array = None
my_tree = build_tree(my_array)
self.assertEqual(my_tree, None)
def test_HuffCode(self):
my_code = HuffCode(97, '001')
self.assertEqual(repr(my_code), '(Code: a, 001)')
def test_gen_code1(self):
file_name = 'test.txt'
my_array = count_occurrence(file_name)
my_tree = build_tree(my_array)
code_array = gen_code(my_tree, gen_256_list())
self.assertEqual(array_list.get(code_array, 32), HuffCode(32, '00'))
self.assertEqual(array_list.get(code_array, 100), HuffCode(100, '100'))
def test_gen_code2(self):
file_name = 'empty.txt'
my_array = count_occurrence(file_name)
my_tree = build_tree(my_array)
code_array = gen_code(my_tree, gen_256_list())
self.assertEqual(code_array, None)
def test_gen_code3(self):
file_name = '1.txt'
my_array = count_occurrence(file_name)
my_tree = build_tree(my_array)
code_array = gen_code(my_tree, gen_256_list())
self.assertEqual(array_list.get(code_array, 97), HuffCode(97, ''))
def test_gen_string1(self):
file_name = 'example.txt'
my_array = count_occurrence(file_name)
my_tree = build_tree(my_array)
code_array = gen_code(my_tree, gen_256_list())
my_string = generate_string(file_name, code_array)
self.assertEqual(my_string, '11011011000011011010011010011')
def test_count_code(self):
my_list = gen_256_list()
count = count_codes(my_list)
self.assertEqual(count, 0)
def test_huff_encode_decode1(self):
a = huffman_encode('example.txt', 'output.bin')
huffman_decode('output.bin', 'decode.txt')
self.assertEqual(a, ' bdca')
def test_huff_encode_decode2(self):
a = huffman_encode('1.txt', 'output1.bin')
huffman_decode('output1.bin', 'decode1.txt')
self.assertEqual(a, 'a')
def test_huff_encode_decode3(self):
a = huffman_encode('empty.txt', 'outputempty.bin')
huffman_decode('outputempty.bin', 'decodeempty.txt')
self.assertEqual(a, '')
def test_huff_encode_decode4(self):
a = huffman_encode('big_test.txt', 'outputbig.bin')
huffman_decode('outputbig.bin', 'decodebig.txt')
self.assertEqual(a, 'af4367eqthgrwy\nds')
def test_huff_encode_decode5(self):
s = huffman_encode('fake.txt', 'outputtest.bin')
self.assertEqual(s, None)
self.assertEqual(huffman_decode('notreal.bin', 'decodetest.txt'), None)
def test_huff_encode_decode6(self):
s = huffman_encode('new.txt', 'outputnew.bin')
self.assertEqual(s, 'a!,cbd.')
self.assertEqual(huffman_decode('outputnew.bin', 'decodenew.txt'), None)
if __name__ == '__main__':
unittest.main()