Skip to content

Commit

Permalink
add huffman
Browse files Browse the repository at this point in the history
  • Loading branch information
shellfly committed Feb 8, 2020
1 parent 751b0aa commit 2121281
Show file tree
Hide file tree
Showing 6 changed files with 321 additions and 1 deletion.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ Try to keep the interface and variable name consistent with the original book wh
* [TST](algs4/tst.py)
* [KMP](algs4/kmp.py)
* [NFA](algs4/nfa.py)
* [Huffman](algs4/huffman.py)
* [LZW](algs4/lzw.py)

## License

Expand Down
41 changes: 41 additions & 0 deletions algs4/binarydump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
* Execution: python binarydump.py n < file
* Data file: https://introcs.cs.princeton.edu/stdlib/abra.txt
*
* Reads in a binary file and writes out the bits, n per line.
*
* % more abra.txt
* ABRACADABRA!
*
* % python binarydump.py 16 < abra.txt
* 0100000101000010
* 0101001001000001
* 0100001101000001
* 0100010001000001
* 0100001001010010
* 0100000100100001
* 96 bits
*
"""

import sys
from algs4.binarystdin import BinaryStdin

# Dump standard input as '0'/'1' characters, starting a new output line every
# `bits_per_line` bits. An optional single CLI argument overrides the width.
bits_per_line = 16
if len(sys.argv) == 2:
    bits_per_line = int(sys.argv[1])
count = 0
while not BinaryStdin.is_empty():
    if bits_per_line == 0:
        # Width 0 means "count only": consume the bit without echoing it.
        # The bit must still be counted here, because `continue` skips the
        # increment at the bottom of the loop (otherwise the total is always 0).
        BinaryStdin.read_bool()
        count += 1
        continue
    if count != 0 and count % bits_per_line == 0:
        # A full line has been printed; break before emitting the next bit.
        print()
    print(1 if BinaryStdin.read_bool() else 0, end="")
    count += 1
if bits_per_line != 0:
    # Terminate the last (possibly partial) line of bits.
    print()
print(count, "bits")
89 changes: 89 additions & 0 deletions algs4/binarystdin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import sys


class BinaryStdin:
    """Bit-level reader over ``sys.stdin.buffer``.

    All state is kept on the class itself and every method is a classmethod,
    so the reader is used as ``BinaryStdin.read_bool()`` etc. without creating
    an instance. Bits are consumed from each byte most-significant bit first.
    """

    # Byte currently being consumed, or the ``EOFError`` class itself, which
    # serves as the end-of-input sentinel (see fill_buffer / is_empty).
    buffer = 0
    # Number of bits of ``buffer`` not yet consumed; -1 once input has ended.
    n = 0
    # True once the first byte has been requested from stdin.
    initialized = False

    @classmethod
    def read_str(cls):
        """Read every remaining byte and return them as one string.

        Each byte becomes the character with that code point (``chr``).

        Raises:
            Exception: if the stream is already empty when called.
        """
        if cls.is_empty():
            raise Exception("reading from empty input stream")
        s = []
        while not cls.is_empty():
            b = cls.read_byte()
            s.append(chr(b))
        return "".join(s)

    @classmethod
    def read_int(cls):
        """Read four bytes and combine them big-endian into one integer.

        The result is a non-negative Python int (no sign extension).

        Raises:
            Exception: if the stream runs out before four bytes were read.
        """
        if cls.is_empty():
            raise Exception("reading from empty input stream")
        x = 0
        for _ in range(4):
            b = cls.read_byte()
            x <<= 8
            x |= b
        return x

    @classmethod
    def read_int_r(cls, r):
        """Read ``r`` bits (1..32) and return them as an integer, MSB first.

        Raises:
            Exception: if ``r`` is outside 1..32 or the stream runs out.
        """
        if r < 1 or r > 32:
            raise Exception("invalid r")
        if r == 32:
            return cls.read_int()
        x = 0
        for _ in range(r):
            x <<= 1
            bit = cls.read_bool()
            if bit:
                x |= 1
        return x

    @classmethod
    def read_byte(cls):
        """Read the next eight bits and return them as an int in 0..255.

        Raises:
            Exception: if fewer than eight bits remain in the stream.
        """
        if cls.is_empty():
            raise Exception("reading from empty input stream")
        if cls.n == 8:
            # Aligned on a byte boundary: the buffered byte is the answer.
            b = cls.buffer
            cls.fill_buffer()
            return b
        # Unaligned: combine the remaining n bits of the current byte with the
        # first 8-n bits of the next one.
        x = cls.buffer
        x <<= (8 - cls.n)
        old_n = cls.n
        cls.fill_buffer()
        if cls.is_empty():
            # The next byte does not exist. Without this check the EOFError
            # sentinel would be shifted below (TypeError) and the n = -1
            # end-of-input marker would be overwritten.
            raise Exception("reading from empty input stream")
        cls.n = old_n
        x |= (cls.buffer >> cls.n)
        return x & 0xff

    @classmethod
    def read_bool(cls):
        """Read one bit and return it as a bool.

        Raises:
            Exception: if the stream is empty.
        """
        if cls.is_empty():
            raise Exception("reading from empty input stream")
        cls.n -= 1
        bit = ((cls.buffer >> cls.n) & 1) == 1
        if cls.n == 0:
            # Current byte is used up; fetch the next one eagerly so that
            # is_empty() reflects end of input immediately.
            cls.fill_buffer()
        return bit

    @classmethod
    def fill_buffer(cls):
        """Load the next byte from stdin, or mark the stream as exhausted."""
        byte = sys.stdin.buffer.read(1)
        if byte == b'':
            # End of input: store the sentinel that is_empty() tests for.
            cls.buffer = EOFError
            cls.n = -1
            return
        cls.n = 8
        cls.buffer = ord(byte)

    @classmethod
    def initialize(cls):
        """Prime the buffer with the first byte of input."""
        cls.fill_buffer()
        cls.initialized = True

    @classmethod
    def is_empty(cls):
        """Return True when no more bits can be read (initializing lazily)."""
        if not cls.initialized:
            cls.initialize()
        return cls.buffer == EOFError
53 changes: 53 additions & 0 deletions algs4/binarystdout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import sys


class BinaryStdout:
    """Bit-level writer on top of ``sys.stdout.buffer``.

    State lives on the class and every method is a classmethod. Bits are
    packed into bytes most-significant bit first; a byte is emitted as soon
    as eight bits have been collected.
    """

    # Bits collected so far for the byte under construction.
    buffer = 0
    # How many bits of ``buffer`` are currently in use (0..7 between calls).
    n = 0

    @classmethod
    def write_int(cls, x):
        """Write the low 32 bits of ``x`` as four bytes, high byte first."""
        for shift in (24, 16, 8, 0):
            cls.write_byte((x >> shift) & 0xff)

    @classmethod
    def write_bits(cls, x, r):
        """Write the ``r`` low-order bits of ``x`` (1 <= r <= 32), MSB first.

        Raises:
            Exception: if ``r`` is outside 1..32.
        """
        if r < 1 or r > 32:
            raise Exception("invalid r")
        if r == 32:
            return cls.write_int(x)
        for pos in range(r - 1, -1, -1):
            cls.write_bit(((x >> pos) & 1) == 1)

    @classmethod
    def write_byte(cls, b):
        """Write the eight low-order bits of ``b``, MSB first."""
        for pos in range(7, -1, -1):
            cls.write_bit(((b >> pos) & 1) == 1)

    @classmethod
    def write_bit(cls, bit):
        """Append one bit (truthy = 1) and emit the byte once it is full."""
        cls.buffer = (cls.buffer << 1) | (1 if bit else 0)
        cls.n += 1
        if cls.n == 8:
            cls.clear_buffer()

    @classmethod
    def clear_buffer(cls):
        """Emit the pending byte, padding unused low bits with zeros.

        Does nothing when no bits are pending.
        """
        pending = cls.n
        if pending == 0:
            return
        if pending > 0:
            # Left-align the collected bits so the padding ends up at the end.
            cls.buffer <<= (8 - pending)
        sys.stdout.buffer.write(bytes([cls.buffer]))
        cls.n = 0
        cls.buffer = 0

    @classmethod
    def close(cls):
        """Write out any partially filled byte."""
        cls.clear_buffer()
135 changes: 135 additions & 0 deletions algs4/huffman.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""
* Execution: python huffman.py - < input.txt (compress)
* Execution: python huffman.py + < input.txt (expand)
* Data files: https://algs4.cs.princeton.edu/55compression/abra.txt
* https://algs4.cs.princeton.edu/55compression/tinytinyTale.txt
* https://algs4.cs.princeton.edu/55compression/medTale.txt
* https://algs4.cs.princeton.edu/55compression/tale.txt
*
* Compress or expand a binary input stream using the Huffman algorithm.
*
* % python huffman.py - < abra.txt | python binarydump.py 60
* 010100000100101000100010010000110100001101010100101010000100
* 000000000000000000000000000110001111100101101000111110010100
* 120 bits
*
* % python huffman.py - < abra.txt | python huffman.py +
* ABRACADABRA!
*
"""

from algs4.binarystdin import BinaryStdin
from algs4.binarystdout import BinaryStdout
from algs4.min_pq import MinPQ


class Node:
    """Node of a Huffman trie.

    Attributes:
        ch: character stored at the node (callers in this file pass
            ``chr(0)`` for internal nodes).
        freq: frequency used to order nodes.
        left: left child, or None.
        right: right child, or None.

    Nodes compare by ``freq`` only (``<`` and ``>``), which is what lets them
    be stored in a priority queue.
    """

    def __init__(self, ch, freq, left, right):
        self.ch = ch
        self.freq = freq
        self.left = left
        self.right = right

    def __str__(self):
        return "%s %d" % (self.ch, self.freq)

    def is_leaf(self):
        """Return True when the node has neither a left nor a right child."""
        # Identity comparison is the idiomatic (and __eq__-proof) None test.
        return self.left is None and self.right is None

    def __lt__(self, other):
        return self.freq < other.freq

    def __gt__(self, other):
        return self.freq > other.freq


class Huffman:
    """Huffman compression and expansion between BinaryStdin and BinaryStdout.

    Compressed layout, as produced by compress() and consumed by expand():
    the preorder-encoded trie, a 32-bit count of encoded characters, then the
    code bits of every character.
    """

    # Alphabet size: one slot per possible byte value.
    R = 256

    @classmethod
    def compress(cls):
        """Read all of standard input and write its Huffman encoding."""
        s = BinaryStdin.read_str()

        # Tabulate how often each character occurs.
        freq = [0] * cls.R
        for ch in s:
            freq[ord(ch)] += 1

        # build huffman trie
        root = cls.build_trie(freq)

        # build code table: st[c] is the bit string ("0"/"1") for chr(c)
        st = [None] * cls.R
        cls.build_code(st, root, "")

        # print trie for decoder
        cls.write_trie(root)

        # print number of bytes in original uncompressed message
        BinaryStdout.write_int(len(s))

        # use Huffman code to encode input
        for ch in s:
            for bit in st[ord(ch)]:
                if bit == "0":
                    BinaryStdout.write_bit(False)
                elif bit == "1":
                    BinaryStdout.write_bit(True)
        BinaryStdout.close()

    @classmethod
    def build_trie(cls, freq):
        """Build the trie for the given frequency table and return its root.

        Args:
            freq: sequence of length R; freq[c] is the count for chr(c).

        One leaf is created per character with a positive count. The two
        nodes returned by successive del_min() calls are then merged under a
        new parent until a single node remains. (MinPQ is defined elsewhere;
        presumably del_min yields the lowest-frequency node.)
        """
        pq = MinPQ()
        for c in range(cls.R):
            if freq[c] > 0:
                pq.insert(Node(chr(c), freq[c], None, None))
        while pq.size() > 1:
            left = pq.del_min()
            right = pq.del_min()
            parent = Node(chr(0), left.freq + right.freq, left, right)
            pq.insert(parent)
        return pq.del_min()

    @classmethod
    def write_trie(cls, x):
        """Write the trie rooted at ``x`` in preorder.

        A leaf is a 1 bit followed by the 8-bit character; an internal node
        is a 0 bit followed by its left and then its right subtree.
        """
        if x.is_leaf():
            BinaryStdout.write_bit(True)
            BinaryStdout.write_byte(ord(x.ch))
            return
        BinaryStdout.write_bit(False)
        cls.write_trie(x.left)
        cls.write_trie(x.right)

    @classmethod
    def build_code(cls, st, x, s):
        """Fill ``st`` with the code of every leaf below ``x``.

        Args:
            st: table indexed by character code, updated in place.
            x: current trie node.
            s: bit string of the path from the root to ``x`` ("0" = left,
               "1" = right).
        """
        if not x.is_leaf():
            cls.build_code(st, x.left, s + "0")
            cls.build_code(st, x.right, s + "1")
        else:
            st[ord(x.ch)] = s

    @classmethod
    def expand(cls):
        """Read a Huffman-encoded stream from stdin and write the original."""
        root = read_trie()
        length = BinaryStdin.read_int()
        for _ in range(length):
            # Walk from the root to a leaf, one input bit per edge.
            x = root
            while not x.is_leaf():
                if BinaryStdin.read_bool():
                    x = x.right
                else:
                    x = x.left
            BinaryStdout.write_byte(ord(x.ch))
        # Mirror compress(): make sure nothing is left pending in the writer.
        BinaryStdout.close()


def read_trie():
    """Rebuild a trie from its preorder encoding on BinaryStdin.

    A 1 bit announces a leaf and is followed by the leaf's 8-bit character;
    a 0 bit announces an internal node, followed by its left subtree and then
    its right subtree. Every node is created with frequency 0.
    """
    if not BinaryStdin.read_bool():
        # Internal node: the left subtree is encoded first, so read it first.
        left_child = read_trie()
        right_child = read_trie()
        return Node(chr(0), 0, left_child, right_child)
    return Node(chr(BinaryStdin.read_byte()), 0, None, None)


if __name__ == '__main__':
    import sys

    # Without a mode argument sys.argv[1] would raise IndexError; exit with a
    # usage message on stderr instead.
    if len(sys.argv) < 2:
        sys.exit("usage: python huffman.py -|+ < input  (- compress, + expand)")
    if sys.argv[1] == "-":
        Huffman.compress()
    else:
        # Any argument other than "-" selects expansion ("+" is the
        # documented spelling).
        Huffman.expand()
2 changes: 1 addition & 1 deletion algs4/min_pq.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def sink(self, k):
if j < N - 1 and self.pq[j] > self.pq[j + 1]:
j += 1

if self.pq[k] < self.pq[j]:
if not self.pq[j] < self.pq[k]:
break

self.pq[k], self.pq[j] = self.pq[j], self.pq[k]
Expand Down

0 comments on commit 2121281

Please sign in to comment.