-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhuffman.py
156 lines (123 loc) · 5.56 KB
/
huffman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Huffman Algorithm Implementation
# This implementation works without logging each step happening
# head for [huffman_log.py] for more information
from queue import PriorityQueue
import time
# a class representing a node in the tree we will build for the algorithm
class HuffmanNode:
def __init__(self, char, freq, timestamp):
self.char = char
self.freq = freq
self.timestamp = timestamp
self.left = None # left node
self.right = None # right node
# less than operator
def __lt__(self, other):
# if two nodes have the same frequency, then the one with the smallest timestamp should be smaller
if self.freq == other.freq:
return self.timestamp > other.timestamp
else:
return self.freq < other.freq
# calculate how many times each character appears in the data
def get_frequency_dict(data):
freq_dict = {}
for char in data:
if char in freq_dict:
freq_dict[char] += 1
else:
freq_dict[char] = 1
return freq_dict
# construct a tree starting from the frequency table generated at the beginning
# till the tree is fully build
def build_huffman_tree(freq_dict):
pq = PriorityQueue()
# add all characters to a tree
current_time = time.time()
for char, freq in freq_dict.items():
node = HuffmanNode(char, freq, current_time)
pq.put(node)
current_time = time.time()
# take two nodes with the lowest frequency and add them to the tree
while pq.qsize() > 1:
last_node = pq.get()
pre_last_node = pq.get()
merged_node = HuffmanNode(pre_last_node.char + last_node.char, last_node.freq + pre_last_node.freq, time.time())
merged_node.left = pre_last_node
merged_node.right = last_node
pq.put(merged_node)
freq_dict[merged_node.char] = merged_node.freq
# get the last node which is the root of the tree
root = pq.get()
return root
# traverse the tree nodes extracting the generated character codes
def traverse_tree(node, code_dict, code=""):
# base condition: calling on a leaf node then calling on the left and right (None) stops
if node is None:
return
# add/update the code of this character
if node.char is not None:
code_dict[node.char] = code
# recursively call the function on the left and right child nodes
traverse_tree(node.left, code_dict, code + "0")
traverse_tree(node.right, code_dict, code + "1")
# using the mechanisms introduced above, encode the data
def huffman_encoding(data):
freq_dict = get_frequency_dict(data)
if len(freq_dict) == 1:
code_dict = {list(freq_dict.keys())[0]: "0"}
else:
root = build_huffman_tree(freq_dict)
code_dict = {}
traverse_tree(root, code_dict)
# Filter the code dictionary to only include keys from the provided data
code_dict = {key: char for key, char in code_dict.items() if key in data}
# for each character in the provided data, retrieve its corresponding
# code and concatenate it to a string forming the compressed data
encoded_data = "".join([code_dict[char] for char in data])
return encoded_data, code_dict
# decode the encoded data again
def huffman_decoding(data, code_dict):
# swap characters with their codes
reversed_dict = {code: char for char, code in code_dict.items()}
decoded_data = ""
current_code = ""
# for each bit in the encoded data, add the bit to the current code
for bit in data:
current_code += bit
# then search for the current code in the codes map
# if found, add the character associated with the code
# to the decoded string and reset the code to begin
# searching for another code
if current_code in reversed_dict:
char = reversed_dict[current_code]
decoded_data += char
current_code = ""
return decoded_data
# this function provides the binary equivalent to the data that a typical computer system would generate
def binary_encode(text):
code_dict = {}
binary_text = ""
for char in text:
# use the built-in ord function to get the ASCII code for each character and then converts the ASCII code
# to binary using the built-in bin function. The bin function returns a string representing the binary
# value, with a prefix of '0b'. This prefix is removed by slicing the string starting from the 2nd character.
# The resulting binary string is then zero-padded to 8 digits using the zfill() method to ensure that each
# character is encoded in a consistent 8-bit format.
binary_char = bin(ord(char))[2:].zfill(8)
binary_text += binary_char
# construct code map
if char not in code_dict:
code_dict[char] = binary_char
return binary_text, code_dict
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
original_text = "aaaaaaaaaaaaaaabbbbbbbccccccddddddeeeee"
uncompressed_output, fixed_length_codes = binary_encode(original_text)
compressed_output, variable_length_codes = huffman_encoding(original_text)
compression_ratio = ((len(compressed_output) / len(uncompressed_output)) * 100)
print(f"original text: {original_text}")
print(f"uncompressed binary representation: {uncompressed_output}")
print(f"fixed-length codes used: {fixed_length_codes}")
print(f"compressed binary representation: {compressed_output}")
print(f"variable-length codes used: {variable_length_codes}")
print(f"compression ratio: {round(compression_ratio, 2)}%")