Note: this patch was re-wrapped from a whitespace-collapsed copy. The
indentation of the README.md and min_pq.py context lines is reconstructed,
and the index hashes of the new files predate the review changes; verify
both before applying.

diff --git a/README.md b/README.md
index f8f4615..f045c5a 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,8 @@ Try to keep the interface and variable name consistent with the original book wh
   * [TST](algs4/tst.py)
   * [KMP](algs4/kmp.py)
   * [NFA](algs4/nfa.py)
+  * [Huffman](algs4/huffman.py)
+  * [LZW](algs4/lzw.py)
 
 ## License
 
diff --git a/algs4/binarydump.py b/algs4/binarydump.py
new file mode 100644
index 0000000..1416e30
--- /dev/null
+++ b/algs4/binarydump.py
@@ -0,0 +1,39 @@
+"""
+ * Execution: python binarydump.py n < file
+ * Data file: https://introcs.cs.princeton.edu/stdlib/abra.txt
+ *
+ * Reads in a binary file and writes out the bits, n per line.
+ * With n = 0 only the final bit count is printed.
+ *
+ * % more abra.txt
+ * ABRACADABRA!
+ *
+ * % python binarydump.py 16 < abra.txt
+ * 0100000101000010
+ * 0101001001000001
+ * 0100001101000001
+ * 0100010001000001
+ * 0100001001010010
+ * 0100000100100001
+ * 96 bits
+ *
+"""
+
+import sys
+from algs4.binarystdin import BinaryStdin
+
+bits_per_line = 16
+if len(sys.argv) == 2:
+    bits_per_line = int(sys.argv[1])
+count = 0
+while not BinaryStdin.is_empty():
+    bit = BinaryStdin.read_bool()
+    if bits_per_line != 0:
+        if count != 0 and count % bits_per_line == 0:
+            print()
+        print(1 if bit else 0, end="")
+    # count every bit, even when n == 0 suppresses the dump
+    count += 1
+if bits_per_line != 0:
+    print()
+print(count, "bits")
diff --git a/algs4/binarystdin.py b/algs4/binarystdin.py
new file mode 100644
index 0000000..0a14e43
--- /dev/null
+++ b/algs4/binarystdin.py
@@ -0,0 +1,112 @@
+import sys
+
+
+class BinaryStdin:
+    """Read bits, bytes and integers from standard input.
+
+    All state is kept on the class, so there is one shared stream.
+    The first byte is fetched lazily by the first is_empty() call.
+    """
+
+    buffer = 0  # current byte; set to EOFError once input runs out
+    n = 0  # number of unread bits left in buffer
+    initialized = False  # True once the first byte has been fetched
+
+    @classmethod
+    def read_str(cls):
+        """Return the remaining input as a string, one char per byte."""
+        if cls.is_empty():
+            raise Exception("reading from empty input stream")
+        s = []
+        while not cls.is_empty():
+            b = cls.read_byte()
+            s.append(chr(b))
+        return "".join(s)
+
+    @classmethod
+    def read_int(cls):
+        """Return the next 4 bytes as an integer, high byte first."""
+        if cls.is_empty():
+            raise Exception("reading from empty input stream")
+        x = 0
+        for i in range(4):
+            b = cls.read_byte()
+            x <<= 8
+            x |= b
+        return x
+
+    @classmethod
+    def read_int_r(cls, r):
+        """Return the next r bits (1 <= r <= 32) as an integer."""
+        if r < 1 or r > 32:
+            raise Exception("invalid r")
+        if r == 32:
+            return cls.read_int()
+        x = 0
+        for i in range(r):
+            x <<= 1
+            bit = cls.read_bool()
+            if bit:
+                x |= 1
+        return x
+
+    @classmethod
+    def read_byte(cls):
+        """Return the next 8 bits as an integer in 0..255.
+
+        Raises an Exception when fewer than 8 bits are left.
+        """
+        if cls.is_empty():
+            raise Exception("reading from empty input stream")
+        # aligned on a byte boundary: the buffer is the answer
+        if cls.n == 8:
+            b = cls.buffer
+            cls.fill_buffer()
+            return b
+        # otherwise join the n bits still in the buffer with the
+        # first 8-n bits of the next byte
+        x = cls.buffer
+        x <<= (8-cls.n)
+        old_n = cls.n
+        cls.fill_buffer()
+        if cls.is_empty():
+            # buffer now holds the EOF marker and cannot be shifted
+            raise Exception("reading from empty input stream")
+        cls.n = old_n
+        x |= (cls.buffer >> cls.n)
+        return x & 0xff
+
+    @classmethod
+    def read_bool(cls):
+        """Return the next bit as a bool."""
+        if cls.is_empty():
+            raise Exception("reading from empty input stream")
+        cls.n -= 1
+        bit = (cls.buffer >> cls.n & 1) == 1
+        if cls.n == 0:
+            cls.fill_buffer()
+        return bit
+
+    @classmethod
+    def fill_buffer(cls):
+        """Load the next input byte, or mark the stream as exhausted."""
+        byte = sys.stdin.buffer.read(1)
+        if byte == b'':
+            cls.buffer = EOFError
+            cls.n = -1
+            return
+        cls.n = 8
+        cls.buffer = ord(byte)
+
+    @classmethod
+    def initialize(cls):
+        """Fetch the first byte; is_empty() calls this once."""
+        cls.fill_buffer()
+        cls.initialized = True
+
+    @classmethod
+    def is_empty(cls):
+        """Return True when no bits are left to read."""
+        if not cls.initialized:
+            cls.initialize()
+        return cls.buffer == EOFError
diff --git a/algs4/binarystdout.py b/algs4/binarystdout.py
new file mode 100644
index 0000000..c6c66d2
--- /dev/null
+++ b/algs4/binarystdout.py
@@ -0,0 +1,66 @@
+import sys
+
+
+class BinaryStdout:
+    """Write bits, bytes and integers to standard output.
+
+    Bits are gathered in a one-byte buffer kept on the class. Call
+    close() after the last write so a partial final byte is emitted.
+    """
+
+    buffer = 0  # bits gathered so far for the next output byte
+    n = 0  # number of bits currently held in buffer
+
+    @classmethod
+    def write_int(cls, x):
+        """Write the low 32 bits of x, most significant byte first."""
+        cls.write_byte((x >> 24) & 0xff)
+        cls.write_byte((x >> 16) & 0xff)
+        cls.write_byte((x >> 8) & 0xff)
+        cls.write_byte((x >> 0) & 0xff)
+
+    @classmethod
+    def write_bits(cls, x, r):
+        """Write the r low-order bits of x (1 <= r <= 32)."""
+        if r < 1 or r > 32:
+            raise Exception("invalid r")
+        if r == 32:
+            return cls.write_int(x)
+        for i in range(r):
+            bit = ((x >> (r - i - 1)) & 1) == 1
+            cls.write_bit(bit)
+
+    @classmethod
+    def write_byte(cls, b):
+        """Write the low 8 bits of b, most significant bit first."""
+        for i in range(8):
+            bit = ((b >> (8 - i - 1)) & 1) == 1
+            cls.write_bit(bit)
+
+    @classmethod
+    def write_bit(cls, bit):
+        """Append one bit; a completed byte is written out at once."""
+        cls.buffer <<= 1
+        if bit:
+            cls.buffer |= 1
+
+        cls.n += 1
+        if cls.n == 8:
+            cls.clear_buffer()
+
+    @classmethod
+    def clear_buffer(cls):
+        """Write the buffered bits, zero-padded to a whole byte."""
+        if cls.n == 0:
+            return
+        if cls.n > 0:
+            cls.buffer <<= (8-cls.n)
+        sys.stdout.buffer.write(bytes([cls.buffer]))
+        cls.n = 0
+        cls.buffer = 0
+
+    @classmethod
+    def close(cls):
+        """Write out any pending bits and flush standard output."""
+        cls.clear_buffer()
+        sys.stdout.buffer.flush()
diff --git a/algs4/huffman.py b/algs4/huffman.py
new file mode 100644
index 0000000..0a1428d
--- /dev/null
+++ b/algs4/huffman.py
@@ -0,0 +1,159 @@
+"""
+ * Execution: python huffman.py - < input.txt (compress)
+ * Execution: python huffman.py + < input.txt (expand)
+ * Data files: https://algs4.cs.princeton.edu/55compression/abra.txt
+ *             https://algs4.cs.princeton.edu/55compression/tinytinyTale.txt
+ *             https://algs4.cs.princeton.edu/55compression/medTale.txt
+ *             https://algs4.cs.princeton.edu/55compression/tale.txt
+ *
+ * Compress or expand a binary input stream using the Huffman algorithm.
+ *
+ * % python huffman.py - < abra.txt | python binarydump.py 60
+ * 010100000100101000100010010000110100001101010100101010000100
+ * 000000000000000000000000000110001111100101101000111110010100
+ * 120 bits
+ *
+ * % python huffman.py - < abra.txt | python huffman.py +
+ * ABRACADABRA!
+ *
+"""
+
+from algs4.binarystdin import BinaryStdin
+from algs4.binarystdout import BinaryStdout
+from algs4.min_pq import MinPQ
+
+
+class Node:
+    """Huffman trie node; nodes compare by frequency."""
+
+    def __init__(self, ch, freq, left, right):
+        self.ch = ch
+        self.freq = freq
+        self.left = left
+        self.right = right
+
+    def __str__(self):
+        return "%s %d" % (self.ch, self.freq)
+
+    def is_leaf(self):
+        return self.left is None and self.right is None
+
+    def __lt__(self, other):
+        return self.freq < other.freq
+
+    def __gt__(self, other):
+        return self.freq > other.freq
+
+
+class Huffman:
+    """Huffman compression and expansion between stdin and stdout."""
+
+    R = 256  # size of the frequency and code tables, one slot per byte
+
+    @classmethod
+    def compress(cls):
+        """Compress standard input to standard output.
+
+        Writes the trie, then the message length as a 32-bit integer,
+        then the Huffman codeword of every input character.
+        """
+        s = BinaryStdin.read_str()
+        freq = [0 for _ in range(cls.R)]
+        for ch in s:
+            freq[ord(ch)] += 1
+
+        # build huffman trie
+        root = cls.build_trie(freq)
+
+        # build code table
+        st = [None for _ in range(cls.R)]
+        cls.build_code(st, root, "")
+
+        # print trie for decoder
+        cls.write_trie(root)
+
+        # print number of bytes in original uncompressed message
+        BinaryStdout.write_int(len(s))
+
+        # use Huffman code to encode input
+        for ch in s:
+            for bit in st[ord(ch)]:
+                BinaryStdout.write_bit(bit == "1")
+        BinaryStdout.close()
+
+    @classmethod
+    def build_trie(cls, freq):
+        """Return the root of the Huffman trie for the given counts."""
+        pq = MinPQ()
+        for c in range(cls.R):
+            if freq[c] > 0:
+                pq.insert(Node(chr(c), freq[c], None, None))
+        # merge the two least frequent subtrees until one trie remains
+        while pq.size() > 1:
+            left = pq.del_min()
+            right = pq.del_min()
+            parent = Node(chr(0), left.freq+right.freq, left, right)
+            pq.insert(parent)
+        return pq.del_min()
+
+    @classmethod
+    def write_trie(cls, x):
+        """Write the trie rooted at x in preorder.
+
+        A leaf is a 1 bit followed by its character byte; an internal
+        node is a 0 bit followed by its left and right subtries.
+        """
+        if x.is_leaf():
+            BinaryStdout.write_bit(True)
+            BinaryStdout.write_byte(ord(x.ch))
+            return
+        BinaryStdout.write_bit(False)
+        cls.write_trie(x.left)
+        cls.write_trie(x.right)
+
+    @classmethod
+    def build_code(cls, st, x, s):
+        """Fill st with the codeword of every leaf below x.
+
+        s is the path from the root to x: "0" for left, "1" for right.
+        """
+        if not x.is_leaf():
+            cls.build_code(st, x.left, s+"0")
+            cls.build_code(st, x.right, s+"1")
+        else:
+            st[ord(x.ch)] = s
+
+    @classmethod
+    def expand(cls):
+        """Expand compressed standard input to standard output."""
+        root = read_trie()
+        length = BinaryStdin.read_int()
+        for i in range(length):
+            # walk from the root to a leaf, one input bit per edge
+            x = root
+            while not x.is_leaf():
+                bit = BinaryStdin.read_bool()
+                if bit:
+                    x = x.right
+                else:
+                    x = x.left
+            BinaryStdout.write_byte(ord(x.ch))
+        BinaryStdout.close()
+
+
+def read_trie():
+    """Rebuild a trie from its preorder encoding on standard input."""
+    is_leaf = BinaryStdin.read_bool()
+    if is_leaf:
+        return Node(chr(BinaryStdin.read_byte()), 0, None, None)
+    return Node(chr(0), 0, read_trie(), read_trie())
+
+
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) < 2 or sys.argv[1] not in ("-", "+"):
+        sys.exit("usage: python huffman.py (-|+) < input")
+    if sys.argv[1] == "-":
+        Huffman.compress()
+    else:
+        Huffman.expand()
diff --git a/algs4/min_pq.py b/algs4/min_pq.py
index 1281a2c..4f7b670 100644
--- a/algs4/min_pq.py
+++ b/algs4/min_pq.py
@@ -37,7 +37,7 @@ def sink(self, k):
             if j < N - 1 and self.pq[j] > self.pq[j + 1]:
                 j += 1
 
-            if self.pq[k] < self.pq[j]:
+            if not self.pq[j] < self.pq[k]:
                 break
 
             self.pq[k], self.pq[j] = self.pq[j], self.pq[k]