From 8a34e3f9d0393d25310936db48e0a742b8c8367f Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sat, 28 Sep 2024 13:13:18 +0200
Subject: [PATCH 01/10] TST: Add lzw.lzw_encode

---
 pypdf/lzw.py      | 103 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_lzw.py |  24 +++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 pypdf/lzw.py
 create mode 100644 tests/test_lzw.py

diff --git a/pypdf/lzw.py b/pypdf/lzw.py
new file mode 100644
index 000000000..43b7c8d1b
--- /dev/null
+++ b/pypdf/lzw.py
@@ -0,0 +1,103 @@
+"""Lempel-Ziv-Welch (LZW) adaptive compression method."""
+
+from typing import List
+
+CLEAR_TABLE_MARKER = 256
+EOD_MARKER = 257
+
+# Data encoded using the LZW compression method shall consist of
+# a sequence of codes that are 9 to 12 bits long
+MAX_CODE_WIDTH = 12
+
+
+def lzw_encode(data: bytes) -> bytes:
+    """
+    Encode byte data with LZW compression.
+
+    Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
+    """
+    max_table_size = 1 << MAX_CODE_WIDTH  # 4096
+
+    # the 258 fixed codes
+    table = {bytes([i]): i for i in range(256)}
+
+    next_code = 258
+    result_codes = []
+
+    # The encoder shall begin by issuing a clear-table code
+    result_codes.append(CLEAR_TABLE_MARKER)
+
+    string = b""
+    for int_character in data:
+        character = bytes([int_character])
+        if string + character in table:
+            # Accumulate a sequence of one or more input characters
+            # matching a sequence already present in the table.
+            # For maximum compression, the encoder looks for the longest
+            # such sequence.
+            string += character
+        else:
+            # Emit the code corresponding to that sequence.
+            result_codes.append(table[string])
+
+            # Before adding a new entry, check if the table is full
+            if len(table) >= max_table_size:
+                # Table is full, emit clear-table code and reset
+                result_codes.append(CLEAR_TABLE_MARKER)
+                table = {bytes([i]): i for i in range(256)}
+                next_code = 258
+                # bits_per_code will be reset in pack_codes_into_bytes
+            else:
+                # Add new sequence to the table
+                table[string + character] = next_code
+                next_code += 1
+
+            string = character
+
+    # Ensure everything actually is encoded
+    if string:
+        result_codes.append(table[string])
+
+    result_codes.append(EOD_MARKER)
+
+    return pack_codes_into_bytes(result_codes)
+
+
+def pack_codes_into_bytes(result_codes: List[int]) -> bytes:
+    """Convert the result code list into bytes."""
+    bits_per_code = 9  # Initially, the code length shall be 9 bits
+    max_code = 1 << bits_per_code  # 512
+    buffer = 0
+    bits_in_buffer = 0
+    output = []
+
+    for code in result_codes:
+        buffer = (buffer << bits_per_code) | code
+        bits_in_buffer += bits_per_code
+
+        # Codes shall be packed into a continuous bit stream, high-order bit
+        # first. This stream shall then be divided into bytes, high-order bit
+        # first.
+        while bits_in_buffer >= 8:
+            bits_in_buffer -= 8
+            output.append((buffer >> bits_in_buffer) & 0xFF)
+
+        # Handle bits_per_code reset after clear-table code
+        if code == CLEAR_TABLE_MARKER:
+            bits_per_code = 9
+            max_code = 1 << bits_per_code
+            continue
+
+        # Whenever both the encoder and the decoder independently (but
+        # synchronously) realize that the current code length is no longer
+        # sufficient to represent the number of entries in the table, they shall
+        # increase the number of bits per code by 1.
+        if code >= max_code - 1 and bits_per_code < MAX_CODE_WIDTH:
+            bits_per_code += 1
+            max_code <<= 1
+
+    # Flush the buffer
+    if bits_in_buffer > 0:
+        output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
+
+    return bytes(output)
diff --git a/tests/test_lzw.py b/tests/test_lzw.py
new file mode 100644
index 000000000..565846eeb
--- /dev/null
+++ b/tests/test_lzw.py
@@ -0,0 +1,24 @@
+"""Test LZW-related code."""
+
+import pytest
+
+from pypdf.filters import LZWDecode
+from pypdf.lzw import lzw_encode
+
+test_cases = [
+    pytest.param(b"", id="Empty input"),  # Empty input
+    pytest.param(b"A", id="Single character"),
+    pytest.param(b"AAAAAA", id="Repeating character"),
+    pytest.param(b"Hello, World!", id="Simple text"),
+    pytest.param(b"ABABABABABAB", id="Repeating pattern"),
+    pytest.param(b"The quick brown fox jumps over the lazy dog", id="Longer text"),
+    pytest.param(b"\x00\xFF\x00\xFF", id="Binary data"),
+]
+
+
+@pytest.mark.parametrize("data", test_cases)
+def test_encode_decode(data):
+    """Decoder and encoder match."""
+    compressed_data = lzw_encode(data)
+    decoded = LZWDecode._decodeb(compressed_data)
+    assert decoded == data

From 609cd7a96952d0fb5354c9a1856bf7eb57d3498a Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sat, 28 Sep 2024 14:37:04 +0200
Subject: [PATCH 02/10] TST: Add failing table-overflow round-trip test for LZW encoder

---
 tests/test_lzw.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/test_lzw.py b/tests/test_lzw.py
index 565846eeb..100219394 100644
--- a/tests/test_lzw.py
+++ b/tests/test_lzw.py
@@ -6,13 +6,17 @@
 from pypdf.lzw import lzw_encode
 
 test_cases = [
-    pytest.param(b"", id="Empty input"),  # Empty input
+    pytest.param(b"", id="Empty input"),
     pytest.param(b"A", id="Single character"),
     pytest.param(b"AAAAAA", id="Repeating character"),
     pytest.param(b"Hello, World!", id="Simple text"),
     pytest.param(b"ABABABABABAB", id="Repeating pattern"),
     pytest.param(b"The quick brown fox jumps over the lazy dog", id="Longer text"),
     pytest.param(b"\x00\xFF\x00\xFF", id="Binary data"),
+    pytest.param(
+        b"BBBCBDBEBFBGBHBIBJBKBLBMBNBOBPBQBRBSBTBUBVBWBXBYBZB[B\\B]B^B_B`BaBbBcBdBeBfBgBhBiBjBkBlBmBnBoBpBqBrBsBtBuBvBwBxByCBCCCDCECFCGCHCICJCKCLCMCNCOCPCQCRCSCTCUCVCWCXCYCZC[C\\C]C^C_C`CaCbCcCdCeCfCgChCiCjCkClCmCnCoCpCqCrCsCtCuCvCwCxCyDBDCDDDEDFDGDHDIDJDKDLDMDNDODPDQDRDSDTDUDVDWDXDYDZD[D\\D]D^D_D`DaDbDcDdDeDfDgDhDiDjDkDlDmDnDoDpDqDrDsDtDuDvDwDxDyEBECEDEEEFEGEHEIEJEKELEMENEOEPEQERESETEUEVEWEXEYEZE[E\\E]E^E_E`EaEbEcEdEeEfEgEhEiEjEkElEmEnEoEpEqErEsEtEuEvEwExEyFBFCFDFEFFFGFHFIFJFKFLFMFNFOFPFQFRFSFTFUFVFWFXFYFZF[F\\F]F^F_F`FaFbFcFdFeFfFgFhFiFjFkFlFmFnFoFpFqFrFsFtFuFvFwFxFyGBGCGDGEGFGGGHGIGJGKGLGMGNGOGPGQGRGSGTGUGVGWGXGYGZG[G\\G]G^G_G`GaGbGcGdGeGfGgGhGiGjGkGlGmGnGoGpGqGrGsGtGuGvGwGxGyHBHCHDHEHFHGHHHIHJHKHLHMHNHOHPHQHRHSHTHUHVHWHXHYHZH[H\\H]H^H_H`HaHbHcHdHeHfHgHhHiHjHkHlHmHnHoHpHqHrHsHtHuHvHwHxHyIBICIDIEIFIGIHIIIJIKILIMINIOIPIQIRISITIUIVIWIXIYIZI[I\\I]I^I_I`IaIbIcIdIeIfIgIhIiIjIkIlImInIoIpIqIrIsItIuIvIwIxIyJBJCJDJEJFJGJHJIJJJKJLJMJNJOJPJQJRJSJTJUJVJWJXJYJZJ[J\\J]J^J_J`JaJbJcJdJeJfJgJhJiJjJkJlJmJnJoJpJqJrJsJtJuJvJwJxJyKBKCKDKEKFKGKHKIKJKKKLKMKNKOKPKQKRKSKTKUKVKWKXKYKZK[K\\K]K^K_K`KaKbKcKdKeKfKgKhKiKjKkKlKmKnKoKpKqKrKsKtKuKvKwKxKyLBLCLDLELFLGLHLILJLKLLLMLNLOLPLQLRLSLTLULVLWLXLYLZL[L\\L]L^L_L`LaLbLcLdLeLfLgLhLiLjLkLlLmLnLoLpLqLrLsLtLuLvLwLxLyMBMCMDMEMFMGMHMIMJMKMLMMMNMOMPMQMRMSMTMUMVMWMXMYMZM[M\\M]M^M_M`MaMbMcMdMeMfMgMhMiMjMkMlMmMnMoMpMqMrMsMtMuMvMwMxMyNBNCNDNENFNGNHNINJNKNLNMNNNONPNQNRNSNTNUNVNWNXNYNZN[N\\N]N^N_N`NaNbNcNdNeNfNgNhNiNjNkNlNmNnNoNpNqNrNsNtNuNvNwNxNyOBOCODOEOFOGOHOIOJOKOLOMONOOOPOQOROSOTOUOVOWOXOYOZO[O\\O]O^O_O`OaObOcOdOeOfOgOhOiOjOkOlOmOnOoOpOqOrOsOtOuOvOwOxOyPBPCPDPEPFPGPHPIPJPKPLPMPNPOPPPQPRPSPTPUPVPWPXPYPZP[P\\P]P^P_P`PaPbPcPdPePfPgPhPiPjPkPlPmPnPoPpPqPrPsPtPuPvPwPxPyQBQCQDQEQFQGQHQIQJQKQLQMQNQOQPQQQRQSQTQUQVQWQXQYQZQ[Q\\Q]Q^Q_Q`QaQbQcQdQeQfQgQhQiQjQkQlQmQnQoQpQqQrQsQtQuQvQwQxQyRBRCRDRERFRGRHRIRJRKRLRMRNRORPRQRRRSRTRURVRWRXRYRZR[R\\R]R^R_R`RaRbRcRdReRfRgRhRiRjRkRlRmRnRoRpRqRrRsRtRuRvRwRxRySBSCSDSESFSGSHSISJSKSLSMSNSOSPSQSRSSSTSUSVSWSXSYSZS[S\\S]S^S_S`SaSbS
cSdSeSfSgShSiSjSkSlSmSnSoSpSqSrSsStSuSvSwSxSyTBTCTDTETFTGTHTITJTKTLTMTNTOTPTQTRTSTTTUTVTWTXTYTZT[T\\T]T^T_T`TaTbTcTdTeTfTgThTiTjTkTlTmTnToTpTqTrTsTtTuTvTwTxTyUBUCUDUEUFUGUHUIUJUKULUMUNUOUPUQURUSUTUUUVUWUXUYUZU[U\\U]U^U_U`UaUbUcUdUeUfUgUhUiUjUkUlUmUnUoUpUqUrUsUtUuUvUwUxUyVBVCVDVEVFVGVHVIVJVKVLVMVNVOVPVQVRVSVTVUVVVWVXVYVZV[V\\V]V^V_V`VaVbVcVdVeVfVgVhViVjVkVlVmVnVoVpVqVrVsVtVuVvVwVxVyWBWCWDWEWFWGWHWIWJWKWLWMWNWOWPWQWRWSWTWUWVWWWXWYWZW[W\\W]W^W_W`WaWbWcWdWeWfWgWhWiWjWkWlWmWnWoWpWqWrWsWtWuWvWwWxWyXBXCXDXEXFXGXHXIXJXKXLXMXNXOXPXQXRXSXTXUXVXWXXXYXZX[X\\X]X^X_X`XaXbXcXdXeXfXgXhXiXjXkXlXmXnXoXpXqXrXsXtXuXvXwXxXyYBYCYDYEYFYGYHYIYJYKYLYMYNYOYPYQYRYSYTYUYVYWYXYYYZY[Y\\Y]Y^Y_Y`YaYbYcYdYeYfYgYhYiYjYkYlYmYnYoYpYqYrYsYtYuYvYwYxYyZBZCZDZEZFZGZHZIZJZKZLZMZNZOZPZQZRZSZTZUZVZWZXZYZZZ[Z\\Z]Z^Z_Z`ZaZbZcZdZeZfZgZhZiZjZkZlZmZnZoZpZqZrZsZtZuZvZwZxZy[B[C[D[E[F[G[H[I[J[K[L[M[N[O[P[Q[R[S[T[U[V[W[X[Y[Z[[[\\[][^[_[`[a[b[c[d[e[f[g[h[i[j[k[l[m[n[o[p[q[r[s[t[u[v[w[x[y\\B\\C\\D\\E\\F\\G\\H\\I\\J\\K\\L\\M\\N\\O\\P\\Q\\R\\S\\T\\U\\V\\W\\X\\Y\\Z\\[\\\\\\]\\^\\_\\`\\a\\b\\c\\d\\e\\f\\g\\h\\i\\j\\k\\l\\m\\n\\o\\p\\q\\r\\s\\t\\u\\v\\w\\x\\y]B]C]D]E]F]G]H]I]J]K]L]M]N]O]P]Q]R]S]T]U]V]W]X]Y]Z][]\\]]]^]_]`]a]b]c]d]e]f]g]h]i]j]k]l]m]n]o]p]q]r]s]t]u]v]w]x]y^B^C^D^E^F^G^H^I^J^K^L^M^N^O^P^Q^R^S^T^U^V^W^X^Y^Z^[^\\^]^^^_^`^a^b^c^d^e^f^g^h^i^j^k^l^m^n^o^p^q^r^s^t^u^v^w^x^y_B_C_D_E_F_G_H_I_J_K_L_M_N_O_P_Q_R_S_T_U_V_W_X_Y_Z_[_\\_]_^___`_a_b_c_d_e_f_g_h_i_j_k_l_m_n_o_p_q_r_s_t_u_v_w_x_y`B`C`D`E`F`G`H`I`J`K`L`M`N`O`P`Q`R`S`T`U`V`W`X`Y`Z`[`\\`]`^`_```a`b`c`d`e`f`g`h`i`j`k`l`m`n`o`p`q`r`s`t`u`v`w`x`yaBaCaDaEaFaGaHaIaJaKaLaMaNaOaPaQaRaSaTaUaVaWaXaYaZa[a\\a]a^a_a`aaabacadaeafagahaiajakalamanaoapaqarasatauavawaxaybBbCbDbEbFbGbHbIbJbKbLbMbNbObPbQbRbSbTbUbVbWbXbYbZb[b\\b]b^b_b`babbbcbdbebfbgbhbibjbkblbmbnbobpbqbrbsbtbubvbwbxbycBcCcDcEcFcGcHcIcJcKcLcMcNcOcPcQcRcScTcUcVcWcXcYcZc[c\\c]c^c_c`cacbcccdcecfcgchcicjckclcmcncocpcqcrcsctcucvcwcxcydBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdn
dodpdqdrdsdtdudvdwdxdyeBeCeDeEeFeGeHeIeJeKeLeMeNeOePeQeReSeTeUeVeWeXeYeZe[e\\e]e^e_e`eaebecedeeefegeheiejekelemeneoepeqereseteuevewexeyfBfCfDfEfFfGfHfIfJfKfLfMfNfOfPfQfRfSfTfUfVfWfXfYfZf[f\\f]f^f_f`fafbfcfdfefffgfhfifjfkflfmfnfofpfqfrfsftfufvfwfxfygBgCgDgEgFgGgHgIgJgKgLgMgNgOgPgQgRgSgTgUgVgWgXgYgZg[g\\g]g^g_g`gagbgcgdgegfggghgigjgkglgmgngogpgqgrgsgtgugvgwgxgyhBhChDhEhFhGhHhIhJhKhLhMhNhOhPhQhRhShThUhVhWhXhYhZh[h\\h]h^h_h`hahbhchdhehfhghhhihjhkhlhmhnhohphqhrhshthuhvhwhxhyiBiCiDiEiFiGiHiIiJiKiLiMiNiOiPiQiRiSiTiUiViWiXiYiZi[i\\i]i^i_i`iaibicidieifigihiiijikiliminioipiqirisitiuiviwixiyjBjCjDjEjFjGjHjIjJjKjLjMjNjOjPjQjRjSjTjUjVjWjXjYjZj[j\\j]j^j_j`jajbjcjdjejfjgjhjijjjkjljmjnjojpjqjrjsjtjujvjwjxjykBkCkDkEkFkGkHkIkJkKkLkMkNkOkPkQkRkSkTkUkVkWkXkYkZk[k\\k]k^k_k`kakbkckdkekfkgkhkikjkkklkmknkokpkqkrksktkukvkwkxkylBlClDlElFlGlHlIlJlKlLlMlNlOlPlQlRlSlTlUlVlWlXlYlZl[l\\l]l^l_l`lalblcldlelflglhliljlklllmlnlolplqlrlsltlulvlwlxlymBmCmDmEmFmGmHmImJmKmLmMmNmOmPmQmRmSmTmUmVmWmXmYmZm[m\\m]m^m_m`mambmcmdmemfmgmhmimjmkmlmmmnmompmqmrmsmtmumvmwmxmynBnCnDnEnFnGnHnInJnKnLnMnNnOnPnQnRnSnTnUnVnWnXnYnZn[n\\n]n^n_n`nanbncndnenfngnhninjnknlnmnnnonpnqnrnsntnunvnwnxnyoBoCoDoEoFoGoHoIoJoKoLoMoNoOoPoQoRoSoToUoVoWoXoYoZo[o\\o]o^o_o`oaobocodoeofogohoiojokolomonooopoqorosotouovowoxoypBpCpDpEpFpGpHpIpJpKpLpMpNpOpPpQpRpSpTpUpVpWpXpYpZp[p\\p]p^p_p`papbpcpdpepfpgphpipjpkplpmpnpopppqprpsptpupvpwpxpyqBqCqDqEqFqGqHqIqJqKqLqMqNqOqPqQqRqSqTqUqVqWqXqYqZq[q\\q]q^q_q`qaqbqcqdqeqfqgqhqiqjqkqlqmqnqoqpqqqrqsqtquqvqwqxqyrBrCrDrErFrGrHrIrJrKrLrMrNrOrPrQrRrSrTrUrVrWrXrYrZr[r\\r]r^r_r`rarbrcrdrerfrgrhrirjrkrlrmrnrorprqrrrsrtrurvrwrxrysBsCsDsEsFsGsHsIsJsKsLsMsNsOsPsQsRsSsTsUsVsWsXsYsZs[s\\s]s^s_s`sasbscsdsesfsgshsisjskslsmsnsospsqsrssstsusvswsxsytBtCtDtEtFtGtHtItJtKtLtMtNtOtPtQtRtStTtUtVtWtXtYtZt[t\\t]t^t_t`tatbtctdtetftgthtitjtktltmtntotptqtrtstttutvtwtxtyuBuCuDuEuFuGuHuIuJuKuLuMuNuOuPuQuRuSuTuUuVuWuXuYuZu[u\\u]u^u_u`uaubucudueufuguhuiujukulumunuoupuqurusutuuuvuwuxuyvBvCvDvEvFvGvHvIvJvKvLvMvNvOvPvQvRvSvTvUvVvWvXvYvZv[v\\v]
v^v_v`vavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvywBwCwDwEwFwGwHwIwJwKwLwMwNwOwPwQwRwSwTwUwVwWwXwYwZw[w\\w]w^w_w`wawbwcwdwewfwgwhwiwjwkwlwmwnwowpwqwrwswtwuwvwwwxwyxBxCxDxExFxGxHxIxJxKxLxMxNxOxPxQxRxSxTxUxVxWxXxYxZx[x\\x]x^x_x`xaxbxcxdxexfxgxhxixjxkxlxmxnxoxpxqxrxsxtxuxvxwxxxyyByCyDyEyFyGyHyIyJyKyLyMyNyOyPyQyRySyTyUyVyWyXyYyZy[y\\y]y^y_y`yaybycydyeyfygyhyiyjykylymynyoypyqyrysytyuyvywyxyyBBBBBCBBDBBEBBFBBGBBHBBIBBJBBKBBLBBMBBNBBOBBPBBQBBRBBSBBTBBUBBVBBWBBXBBYBBZBB[BB\\BB]BB^BB_BB`BBaBBbBBcBBdBBeBBfBBgBBhBBiBBjBBkBBlBBmBBnBBoBBpBBqBBrBBsBBtBBuBBvBBwBBxBByBCBBCCBCDBCEBCFBCGBCHBCIBCJBCKBCLBCMBCNBCOBCPBCQBCRBCSBCTBCUBCVBCWBCXBCYBCZBC[BC\\BC]BC^BC_BC`BCaBCbBCcBCdBCeBCfBCgBChBCiBCjBCkBClBCmBCnBCoBCpBCqBCrBCsBCtBCuBCvBCwBCxBCyBDBBDCBDDBDEBDFBDGBDHBDIBDJBDKBDLBDMBDNBDOBDPBDQBDRBDSBDTBDUBDVBDWBDXBDYBDZBD[BD\\BD]BD^BD_BD`BDaBDbBDcBDdBDeBDfBDgBDhBDiBDjBDkBDlBDmBDnBDoBDpBDqBDrBDsBDtBDuBDvBDwBDxBDyBEBBECBEDBEEBEFBEGBEHBEIBEJBEKBELBEMBENBEOBEPBEQBERBESBETBEUBEVBEWBEXBEYBEZBE[BE\\BE]BE^BE_BE`BEaBEbBEcBEdBEeBEfBEgBEhBEiBEjBEkBElBEmBEnBEoBEpBEqBErBEsBEtBEuBEvBEwBExBEyBFBBFCBFDBFEBFFBFGBFHBFIBFJBFKBFLBFMBFNBFOBFPBFQBFRBFSBFTBFUBFVBFWBFXBFYBFZBF[BF\\BF]BF^BF_BF`BFaBFbBFcBFdBFeBFfBFgBFhBFiBFjBFkBFlBFmBFnBFoBFpBFqBFrBFsBFtBFuBFvBFwBFxBFyBGBBGCBGDBGEBGFBGGBGHBGIBGJBGKBGLBGMBGNBGOBGPBGQBGRBGSBGTBGUBGVBGWBGXBGYBGZBG[BG\\BG]BG^BG_BG`BGaBGbBGcBGdBGeBGfBGgBGhBGiBGjBGkBGlBGmBGnBGoBGpBGqBGrBGsBGtBGuBGvBGwBGxBGyBHBBHCBHDBHEBHFBHGBHHBHIBHJBHKBHLBHMBHNBHOBHPBHQBHRBHSBHTBHUBHVBHWBHXBHYBHZBH[BH\\BH]BH^BH_BH`BHaBHbBHcBHdBHeBHfBHgBHhBHiBHjBHkBHlBHmBHnBHoBHpBHqBHrBHsBHtBHuBHvBHwBHxBHyBIBBICBIDBIEBIFBIGBIHBIIBIJBIKBILBIMBINBIOBIPBIQBIRBISBITBIUBIVBIWBIXBIYBIZBI[BI\\BI]BI^BI_BI`BIaBIbBIcBIdBIeBIfBIgBIhBIiBIjBIkBIlBImBInBIoBIpBIqBIrBIsBItBIuBIvBIwBIxBIyBJBBJCBJDBJEBJFBJGBJHBJIBJJBJKBJLBJMBJNBJOBJPBJQBJRBJSBJTBJUBJVBJWBJXBJYBJZBJ[BJ\\BJ]BJ^BJ_BJ`BJaBJbBJcBJdBJeBJfBJgBJhBJiBJjBJkBJlBJmBJnBJoBJpBJqBJrBJsBJtBJuBJvBJwBJxBJyBKBBKCBKDBKEBKFBKGBKHBKIBKJBKKBKLBKMBKNBKOBKPBKQBKRBKSBKTBKUBKVBKWBKXBKYBKZBK[BK\\BK
]BK^BK_BK`BKaBKbBKcBKdBKeBKfBKgBKhBKiBKjBKkBKlBKmBKnBKoBKpBKqBKrBKsBKtBKuBKvBKwBKxBKyBLBBLCBLDBLEBLFBLGBLHBLIBLJBLKBLLBLMBLNBLOBLPBLQBLRBLSBLTBLUBLVBLWBLXBLYBLZBL[BL\\BL]BL^BL_BL`BLaBLbBLcBLdBLeBLfBLgBLhBLiBLjBLkBLlBLmBLnBLoBLpBLqBLrBLsBLtBLuBLvBLwBLxBLyBMBBMCBMDBMEBMFBMGBMHBMIBMJBMKBMLBMMBMNBMOBMPBMQBMRBMSBMTBMUBMVBMWBMXBMYBMZBM[BM\\BM]BM^BM_BM`BMaBMbBMcBMdBMeBMfBMgBMhBMiBMjBMkBMlBMmBMnBMoBMpBMqBMrBMsBMtBMuBMvBMwBMxBMyBNBBNCBNDBNEBNFBNGBNHBNIBNJBNKBNLBNMBNNBNOBNPBNQBNRBNSBNTBNUBNVBNWBNXBNYBNZBN[BN\\BN]BN^BN_",
+        id="Table overflow",
+    ),
 ]
 
 
@@ -21,4 +25,9 @@ def test_encode_decode(data):
     """Decoder and encoder match."""
     compressed_data = lzw_encode(data)
     decoded = LZWDecode._decodeb(compressed_data)
+    msg = f"len(data)={len(data)}, len(decoded)={len(decoded)}, len(compressed_data)={len(compressed_data)}"
+    if decoded != data:
+        for i in range(len(data)):
+            assert decoded[i] == data[i], msg
+
     assert decoded == data

From 32c18bca701b740a5492f3b6604eba25d3d874ba Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 10:00:35 +0200
Subject: [PATCH 03/10] MAINT: Move LZW encoder functions into LzwCodec class

---
 pypdf/lzw.py | 200 ++++++++++++++++++++++++++-------------------------
 1 file changed, 102 insertions(+), 98 deletions(-)

diff --git a/pypdf/lzw.py b/pypdf/lzw.py
index 43b7c8d1b..44b0fe817 100644
--- a/pypdf/lzw.py
+++ b/pypdf/lzw.py
@@ -2,102 +2,106 @@
 
 from typing import List
 
-CLEAR_TABLE_MARKER = 256
-EOD_MARKER = 257
-
-# Data encoded using the LZW compression method shall consist of
-# a sequence of codes that are 9 to 12 bits long
-MAX_CODE_WIDTH = 12
-
-
-def lzw_encode(data: bytes) -> bytes:
-    """
-    Encode byte data with LZW compression.
-
-    Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
-    """
-    max_table_size = 1 << MAX_CODE_WIDTH  # 4096
-
-    # the 258 fixed codes
-    table = {bytes([i]): i for i in range(256)}
-
-    next_code = 258
-    result_codes = []
-
-    # The encoder shall begin by issuing a clear-table code
-    result_codes.append(CLEAR_TABLE_MARKER)
-
-    string = b""
-    for int_character in data:
-        character = bytes([int_character])
-        if string + character in table:
-            # Accumulate a sequence of one or more input characters
-            # matching a sequence already present in the table.
-            # For maximum compression, the encoder looks for the longest
-            # such sequence.
-            string += character
-        else:
-            # Emit the code corresponding to that sequence.
-            result_codes.append(table[string])
-
-            # Before adding a new entry, check if the table is full
-            if len(table) >= max_table_size:
-                # Table is full, emit clear-table code and reset
-                result_codes.append(CLEAR_TABLE_MARKER)
-                table = {bytes([i]): i for i in range(256)}
-                next_code = 258
-                # bits_per_code will be reset in pack_codes_into_bytes
+
+class LzwCodec:
+    CLEAR_TABLE_MARKER = 256
+    EOD_MARKER = 257
+
+    # Data encoded using the LZW compression method shall consist of
+    # a sequence of codes that are 9 to 12 bits long
+    MAX_CODE_WIDTH = 12
+
+    def __init__(self) -> None:
+        self.clear_table()
+
+    def clear_table(self) -> None:
+        """Clear the table."""
+        # the 258 fixed codes
+        self.table = {bytes([i]): i for i in range(256)}
+        self.next_code = 258
+
+    def encode(self, data: bytes) -> bytes:
+        """
+        Encode byte data with LZW compression.
+
+        Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
+        """
+        max_table_size = 1 << self.MAX_CODE_WIDTH  # 4096
+
+        result_codes = []
+
+        # The encoder shall begin by issuing a clear-table code
+        result_codes.append(self.CLEAR_TABLE_MARKER)
+
+        string = b""
+        for int_character in data:
+            character = bytes([int_character])
+            if string + character in self.table:
+                # Accumulate a sequence of one or more input characters
+                # matching a sequence already present in the table.
+                # For maximum compression, the encoder looks for the longest
+                # such sequence.
+                string += character
             else:
-                # Add new sequence to the table
-                table[string + character] = next_code
-                next_code += 1
-
-            string = character
-
-    # Ensure everything actually is encoded
-    if string:
-        result_codes.append(table[string])
-
-    result_codes.append(EOD_MARKER)
-
-    return pack_codes_into_bytes(result_codes)
-
-
-def pack_codes_into_bytes(result_codes: List[int]) -> bytes:
-    """Convert the result code list into bytes."""
-    bits_per_code = 9  # Initially, the code length shall be 9 bits
-    max_code = 1 << bits_per_code  # 512
-    buffer = 0
-    bits_in_buffer = 0
-    output = []
-
-    for code in result_codes:
-        buffer = (buffer << bits_per_code) | code
-        bits_in_buffer += bits_per_code
-
-        # Codes shall be packed into a continuous bit stream, high-order bit
-        # first. This stream shall then be divided into bytes, high-order bit
-        # first.
-        while bits_in_buffer >= 8:
-            bits_in_buffer -= 8
-            output.append((buffer >> bits_in_buffer) & 0xFF)
-
-        # Handle bits_per_code reset after clear-table code
-        if code == CLEAR_TABLE_MARKER:
-            bits_per_code = 9
-            max_code = 1 << bits_per_code
-            continue
-
-        # Whenever both the encoder and the decoder independently (but
-        # synchronously) realize that the current code length is no longer
-        # sufficient to represent the number of entries in the table, they shall
-        # increase the number of bits per code by 1.
-        if code >= max_code - 1 and bits_per_code < MAX_CODE_WIDTH:
-            bits_per_code += 1
-            max_code <<= 1
-
-    # Flush the buffer
-    if bits_in_buffer > 0:
-        output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
-
-    return bytes(output)
+                # Emit the code corresponding to that sequence.
+                result_codes.append(self.table[string])
+
+                # Before adding a new entry, check if the table is full
+                if len(self.table) >= max_table_size:
+                    # Table is full, emit clear-table code and reset
+                    result_codes.append(self.CLEAR_TABLE_MARKER)
+                    self.clear_table()
+                    # bits_per_code will be reset in pack_codes_into_bytes
+                else:
+                    # Add new sequence to the table ..
+                    self.table[string + character] = self.next_code
+                    self.next_code += 1
+
+                string = character
+
+        # Ensure everything actually is encoded
+        if string:
+            result_codes.append(self.table[string])
+
+        result_codes.append(self.EOD_MARKER)
+
+        return self.pack_codes_into_bytes(result_codes)
+
+    def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes:
+        """Convert the result code list into bytes."""
+        bits_per_code = 9  # Initially, the code length shall be 9 bits
+        max_code = 1 << bits_per_code  # 512
+        buffer = 0
+        bits_in_buffer = 0
+        output = []
+
+        for code in result_codes:
+            buffer = (buffer << bits_per_code) | code
+            bits_in_buffer += bits_per_code
+
+            # Codes shall be packed into a continuous bit stream, high-order bit
+            # first. This stream shall then be divided into bytes, high-order bit
+            # first.
+            while bits_in_buffer >= 8:
+                bits_in_buffer -= 8
+                output.append((buffer >> bits_in_buffer) & 0xFF)
+
+            # Handle bits_per_code reset after clear-table code
+            if code == self.CLEAR_TABLE_MARKER:
+                bits_per_code = 9
+                max_code = 1 << bits_per_code
+                continue
+
+            # Whenever both the encoder and the decoder independently (but
+            # synchronously) realize that the current code length is no longer
+            # sufficient to represent the number of entries in the table, they shall
+            # increase the number of bits per code by 1.
+            if code >= max_code - 1 and bits_per_code < self.MAX_CODE_WIDTH:
+                bits_per_code += 1
+                max_code <<= 1
+
+        # Flush the buffer
+        if bits_in_buffer > 0:
+            output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
+
+        return bytes(output)

From fa6dfb3d4cb0e259d26eb088f80b7cd3d70182a3 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 10:12:13 +0200
Subject: [PATCH 04/10] MAINT: Tidy LzwCodec naming and comments; switch test to LzwCodec

---
 pypdf/lzw.py      | 75 ++++++++++++++++++++++-------------------------
 tests/test_lzw.py |  5 ++--
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/pypdf/lzw.py b/pypdf/lzw.py
index 44b0fe817..e5b9eb042 100644
--- a/pypdf/lzw.py
+++ b/pypdf/lzw.py
@@ -4,75 +4,73 @@
 
 
 class LzwCodec:
-    CLEAR_TABLE_MARKER = 256
-    EOD_MARKER = 257
+    """Lempel-Ziv-Welch (LZW) adaptive compression codec."""
 
-    # Data encoded using the LZW compression method shall consist of
-    # a sequence of codes that are 9 to 12 bits long
-    MAX_CODE_WIDTH = 12
+    CLEAR_TABLE_MARKER = 256  # Special code to indicate table reset
+    EOD_MARKER = 257  # End-of-data marker
+    MAX_CODE_WIDTH = 12  # Codes can range from 9 to 12 bits
 
     def __init__(self) -> None:
+        """Initialize codec and reset the compression table."""
         self.clear_table()
 
     def clear_table(self) -> None:
-        """Clear the table."""
-        # the 258 fixed codes
+        """Reset the encoding table to initial state with single-byte sequences."""
         self.table = {bytes([i]): i for i in range(256)}
-        self.next_code = 258
+        self.next_code = self.EOD_MARKER + 1
 
     def encode(self, data: bytes) -> bytes:
         """
-        Encode byte data with LZW compression.
+        Encode data using the LZW compression algorithm.
 
         Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
         """
-        max_table_size = 1 << self.MAX_CODE_WIDTH  # 4096
-
+        max_table_size = 1 << self.MAX_CODE_WIDTH  # 4096 entries when fully expanded
         result_codes = []
 
         # The encoder shall begin by issuing a clear-table code
         result_codes.append(self.CLEAR_TABLE_MARKER)
 
-        string = b""
-        for int_character in data:
-            character = bytes([int_character])
-            if string + character in self.table:
-                # Accumulate a sequence of one or more input characters
-                # matching a sequence already present in the table.
-                # For maximum compression, the encoder looks for the longest
-                # such sequence.
-                string += character
+        current_sequence = b""
+        for byte in data:
+            next_sequence = current_sequence + bytes([byte])
+
+            if next_sequence in self.table:
+                # Extend current sequence if already in the table
+                current_sequence = next_sequence
             else:
-                # Emit the code corresponding to that sequence.
-                result_codes.append(self.table[string])
+                # Output code for the current sequence
+                result_codes.append(self.table[current_sequence])
 
-                # Before adding a new entry, check if the table is full
+                # If the table is full, emit a clear-table command
                 if len(self.table) >= max_table_size:
-                    # Table is full, emit clear-table code and reset
                     result_codes.append(self.CLEAR_TABLE_MARKER)
                     self.clear_table()
-                    # bits_per_code will be reset in pack_codes_into_bytes
                 else:
-                    # Add new sequence to the table ..
-                    self.table[string + character] = self.next_code
+                    # Add the new sequence to the table
+                    self.table[next_sequence] = self.next_code
                     self.next_code += 1
 
-                string = character
+                # Reset to the new character
+                current_sequence = bytes([byte])
 
         # Ensure everything actually is encoded
-        if string:
-            result_codes.append(self.table[string])
+        if current_sequence:
+            result_codes.append(self.table[current_sequence])
 
         result_codes.append(self.EOD_MARKER)
 
         return self.pack_codes_into_bytes(result_codes)
 
     def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes:
-        """Convert the result code list into bytes."""
+        """
+        Convert the list of result codes into a continuous byte stream, with codes packed as per the current bit-width.
+        The bit-width starts at 9 bits and expands as needed.
+        """
         bits_per_code = 9  # Initially, the code length shall be 9 bits
         max_code = 1 << bits_per_code  # 512
-        buffer = 0
-        bits_in_buffer = 0
+        buffer = 0  # Temporary storage for bits to be packed into bytes
+        bits_in_buffer = 0  # Number of bits currently in the buffer
         output = []
 
         for code in result_codes:
@@ -86,21 +84,18 @@ def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes:
                 bits_in_buffer -= 8
                 output.append((buffer >> bits_in_buffer) & 0xFF)
 
-            # Handle bits_per_code reset after clear-table code
+            # After a clear-table marker, reset to 9-bit codes
             if code == self.CLEAR_TABLE_MARKER:
                 bits_per_code = 9
                 max_code = 1 << bits_per_code
                 continue
 
-            # Whenever both the encoder and the decoder independently (but
-            # synchronously) realize that the current code length is no longer
-            # sufficient to represent the number of entries in the table, they shall
-            # increase the number of bits per code by 1.
+            # Expand the code width if the next code exceeds the current range
             if code >= max_code - 1 and bits_per_code < self.MAX_CODE_WIDTH:
                 bits_per_code += 1
-                max_code <<= 1
+                max_code <<= 1  # Double the range for the new bit-width
 
-        # Flush the buffer
+        # Flush any remaining bits in the buffer
         if bits_in_buffer > 0:
             output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
 
diff --git a/tests/test_lzw.py b/tests/test_lzw.py
index 100219394..16398a3b8 100644
--- a/tests/test_lzw.py
+++ b/tests/test_lzw.py
@@ -3,7 +3,7 @@
 import pytest
 
 from pypdf.filters import LZWDecode
-from pypdf.lzw import lzw_encode
+from pypdf.lzw import LzwCodec
 
 test_cases = [
     pytest.param(b"", id="Empty input"),
@@ -23,7 +23,8 @@
 @pytest.mark.parametrize("data", test_cases)
 def test_encode_decode(data):
     """Decoder and encoder match."""
-    compressed_data = lzw_encode(data)
+    codec = LzwCodec()
+    compressed_data = codec.encode(data)
     decoded = LZWDecode._decodeb(compressed_data)
     msg = f"len(data)={len(data)}, len(decoded)={len(decoded)}, len(compressed_data)={len(compressed_data)}"
     if decoded != data:

From 5cc104896d6cb5669a1d626621eca8324b611c02 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 10:21:07 +0200
Subject: [PATCH 05/10] Fix off-by-one errors

---
 pypdf/lzw.py | 77 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/pypdf/lzw.py b/pypdf/lzw.py
index e5b9eb042..133277b3b 100644
--- a/pypdf/lzw.py
+++ b/pypdf/lzw.py
@@ -1,6 +1,6 @@
 """Lempel-Ziv-Welch (LZW) adaptive compression method."""
 
-from typing import List
+from typing import Dict, List
 
 
 class LzwCodec:
@@ -8,7 +8,8 @@ class LzwCodec:
 
     CLEAR_TABLE_MARKER = 256  # Special code to indicate table reset
     EOD_MARKER = 257  # End-of-data marker
-    MAX_CODE_WIDTH = 12  # Codes can range from 9 to 12 bits
+    INITIAL_BITS_PER_CODE = 9  # Initial code bit width
+    MAX_BITS_PER_CODE = 12  # Maximum code bit width
 
     def __init__(self) -> None:
         """Initialize codec and reset the compression table."""
@@ -16,8 +17,10 @@ def __init__(self) -> None:
 
     def clear_table(self) -> None:
         """Reset the encoding table to initial state with single-byte sequences."""
-        self.table = {bytes([i]): i for i in range(256)}
+        self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
         self.next_code = self.EOD_MARKER + 1
+        self.bits_per_code = self.INITIAL_BITS_PER_CODE
+        self.max_code_value = (1 << self.bits_per_code) - 1
 
     def encode(self, data: bytes) -> bytes:
         """
@@ -25,11 +28,11 @@ def encode(self, data: bytes) -> bytes:
 
         Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
         """
-        max_table_size = 1 << self.MAX_CODE_WIDTH  # 4096 entries when fully expanded
-        result_codes = []
+        result_codes: List[int] = []
 
         # The encoder shall begin by issuing a clear-table code
         result_codes.append(self.CLEAR_TABLE_MARKER)
+        self.clear_table()
 
         current_sequence = b""
         for byte in data:
@@ -42,16 +45,23 @@ def encode(self, data: bytes) -> bytes:
                 # Output code for the current sequence
                 result_codes.append(self.table[current_sequence])
 
-                # If the table is full, emit a clear-table command
-                if len(self.table) >= max_table_size:
-                    result_codes.append(self.CLEAR_TABLE_MARKER)
-                    self.clear_table()
-                else:
-                    # Add the new sequence to the table
+                # Add the new sequence to the table if there's room
+                if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
                     self.table[next_sequence] = self.next_code
                     self.next_code += 1
+                    # Increase bits_per_code if necessary
+                    if (
+                        self.next_code > self.max_code_value
+                        and self.bits_per_code < self.MAX_BITS_PER_CODE
+                    ):
+                        self.bits_per_code += 1
+                        self.max_code_value = (1 << self.bits_per_code) - 1
+                else:
+                    # If the table is full, emit a clear-table command
+                    result_codes.append(self.CLEAR_TABLE_MARKER)
+                    self.clear_table()
 
-                # Reset to the new character
+                # Start new sequence
                 current_sequence = bytes([byte])
 
         # Ensure everything actually is encoded
@@ -62,18 +72,20 @@ def encode(self, data: bytes) -> bytes:
 
         return self.pack_codes_into_bytes(result_codes)
 
-    def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes:
+    def pack_codes_into_bytes(self, codes: List[int]) -> bytes:
         """
-        Convert the list of result codes into a continuous byte stream, with codes packed as per the current bit-width.
+        Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
         The bit-width starts at 9 bits and expands as needed.
         """
-        bits_per_code = 9  # Initially, the code length shall be 9 bits
-        max_code = 1 << bits_per_code  # 512
-        buffer = 0  # Temporary storage for bits to be packed into bytes
-        bits_in_buffer = 0  # Number of bits currently in the buffer
-        output = []
-
-        for code in result_codes:
+        bits_per_code = self.INITIAL_BITS_PER_CODE
+        max_bits_per_code = self.MAX_BITS_PER_CODE
+        max_code_value = (1 << bits_per_code) - 1
+        next_code = self.EOD_MARKER + 1
+        buffer = 0
+        bits_in_buffer = 0
+        output = bytearray()
+
+        for code in codes:
             buffer = (buffer << bits_per_code) | code
             bits_in_buffer += bits_per_code
 
@@ -84,16 +96,21 @@ def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes:
                 bits_in_buffer -= 8
                 output.append((buffer >> bits_in_buffer) & 0xFF)
 
-            # After a clear-table marker, reset to 9-bit codes
+            # After a clear-table marker, reset bits_per_code and next_code
             if code == self.CLEAR_TABLE_MARKER:
-                bits_per_code = 9
-                max_code = 1 << bits_per_code
-                continue
-
-            # Expand the code width if the next code exceeds the current range
-            if code >= max_code - 1 and bits_per_code < self.MAX_CODE_WIDTH:
-                bits_per_code += 1
-                max_code <<= 1  # Double the range for the new bit-width
+                bits_per_code = self.INITIAL_BITS_PER_CODE
+                max_code_value = (1 << bits_per_code) - 1
+                next_code = self.EOD_MARKER + 1
+            elif code == self.EOD_MARKER:
+                # Do not increment next_code for EOD_MARKER
+                pass
+            else:
+                # Increase next_code after processing each code (except special codes)
+                next_code += 1
+                # Increase bits_per_code if necessary
+                if next_code > max_code_value and bits_per_code < max_bits_per_code:
+                    bits_per_code += 1
+                    max_code_value = (1 << bits_per_code) - 1
 
         # Flush any remaining bits in the buffer
         if bits_in_buffer > 0:

From 59ec6b95b61838df4cb5abc06f825d5f1a7c98fc Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 10:25:57 +0200
Subject: [PATCH 06/10] Reuse clear_table() state in pack_codes_into_bytes

---
 pypdf/lzw.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/pypdf/lzw.py b/pypdf/lzw.py
index 133277b3b..67054c75b 100644
--- a/pypdf/lzw.py
+++ b/pypdf/lzw.py
@@ -16,7 +16,7 @@ def __init__(self) -> None:
         self.clear_table()
 
     def clear_table(self) -> None:
-        """Reset the encoding table to initial state with single-byte sequences."""
+        """Reset the encoding table and coding state to initial conditions."""
         self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
         self.next_code = self.EOD_MARKER + 1
         self.bits_per_code = self.INITIAL_BITS_PER_CODE
@@ -77,17 +77,15 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes:
         Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
         The bit-width starts at 9 bits and expands as needed.
         """
-        bits_per_code = self.INITIAL_BITS_PER_CODE
-        max_bits_per_code = self.MAX_BITS_PER_CODE
-        max_code_value = (1 << bits_per_code) - 1
-        next_code = self.EOD_MARKER + 1
+        # Reset coding state
+        self.clear_table()
         buffer = 0
         bits_in_buffer = 0
         output = bytearray()
 
         for code in codes:
-            buffer = (buffer << bits_per_code) | code
-            bits_in_buffer += bits_per_code
+            buffer = (buffer << self.bits_per_code) | code
+            bits_in_buffer += self.bits_per_code
 
             # Codes shall be packed into a continuous bit stream, high-order bit
             # first. This stream shall then be divided into bytes, high-order bit
@@ -96,21 +94,22 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes:
                 bits_in_buffer -= 8
                 output.append((buffer >> bits_in_buffer) & 0xFF)
 
-            # After a clear-table marker, reset bits_per_code and next_code
+            # After a clear-table marker, reset coding state
             if code == self.CLEAR_TABLE_MARKER:
-                bits_per_code = self.INITIAL_BITS_PER_CODE
-                max_code_value = (1 << bits_per_code) - 1
-                next_code = self.EOD_MARKER + 1
+                self.clear_table()
             elif code == self.EOD_MARKER:
                 # Do not increment next_code for EOD_MARKER
                 pass
             else:
                 # Increase next_code after processing each code (except special codes)
-                next_code += 1
+                self.next_code += 1
                 # Increase bits_per_code if necessary
-                if next_code > max_code_value and bits_per_code < max_bits_per_code:
-                    bits_per_code += 1
-                    max_code_value = (1 << bits_per_code) - 1
+                if (
+                    self.next_code > self.max_code_value
+                    and self.bits_per_code < self.MAX_BITS_PER_CODE
+                ):
+                    self.bits_per_code += 1
+                    self.max_code_value = (1 << self.bits_per_code) - 1
 
         # Flush any remaining bits in the buffer
         if bits_in_buffer > 0:

From 42a2d981297f9a2efda2bf35fe7ee32a33caa864 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 10:46:14 +0200
Subject: [PATCH 07/10] Add Codec ABC and rename lzw.py to codecs.py

---
 pypdf/{lzw.py => codecs.py}           | 44 +++++++++++++++++++++++++--
 tests/{test_lzw.py => test_codecs.py} | 10 ++----
 2 files changed, 44 insertions(+), 10 deletions(-)
 rename pypdf/{lzw.py => codecs.py} (83%)
 rename tests/{test_lzw.py => test_codecs.py} (96%)

diff --git a/pypdf/lzw.py b/pypdf/codecs.py
similarity index 83%
rename from pypdf/lzw.py
rename to pypdf/codecs.py
index 67054c75b..d6858d095 100644
--- a/pypdf/lzw.py
+++ b/pypdf/codecs.py
@@ -1,9 +1,43 @@
-"""Lempel-Ziv-Welch (LZW) adaptive compression method."""
+"""
+This module is for codecs only.
 
+While the codec implementation can contain details of the PDF specification,
+the module should not do any PDF parsing.
+"""
+
+from abc import ABC, abstractmethod
 from typing import Dict, List
 
 
-class LzwCodec:
+class Codec(ABC):
+    """Abstract base class for all codecs."""
+
+    @abstractmethod
+    def encode(self, data: bytes) -> bytes:
+        """
+        Encode the input data.
+
+        Args:
+            data: Data to encode.
+
+        Returns:
+            Encoded data.
+        """
+
+    @abstractmethod
+    def decode(self, data: bytes) -> bytes:
+        """
+        Decode the input data.
+
+        Args:
+            data: Data to decode.
+
+        Returns:
+            Decoded data.
+        """
+
+
+class LzwCodec(Codec):
     """Lempel-Ziv-Welch (LZW) adaptive compression codec."""
 
     CLEAR_TABLE_MARKER = 256  # Special code to indicate table reset
@@ -116,3 +150,9 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes:
             output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
 
         return bytes(output)
+
+    def decode(self, data: bytes) -> bytes:
+        """Decode data using LZW."""
+        from .filters import LZWDecode
+
+        return LZWDecode._decodeb(data)
diff --git a/tests/test_lzw.py b/tests/test_codecs.py
similarity index 96%
rename from tests/test_lzw.py
rename to tests/test_codecs.py
index 16398a3b8..f96e543a5 100644
--- a/tests/test_lzw.py
+++ b/tests/test_codecs.py
@@ -2,8 +2,7 @@
 
 import pytest
 
-from pypdf.filters import LZWDecode
-from pypdf.lzw import LzwCodec
+from pypdf.codecs import LzwCodec
 
 test_cases = [
     pytest.param(b"", id="Empty input"),
@@ -25,10 +24,5 @@ def test_encode_decode(data):
     """Decoder and encoder match."""
     codec = LzwCodec()
     compressed_data = codec.encode(data)
-    decoded = LZWDecode._decodeb(compressed_data)
-    msg = f"len(data)={len(data)}, len(decoded)={len(decoded)}, len(compressed_data)={len(compressed_data)}"
-    if decoded != data:
-        for i in range(len(data)):
-            assert decoded[i] == data[i], msg
-
+    decoded = codec.decode(compressed_data)
     assert decoded == data

From 1ab81632b0cbdba0c80cc3e79b0c1a9f98e90b21 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 12:55:18 +0200
Subject: [PATCH 08/10] Pass data to LZWDecode.Decoder init

---
 pypdf/codecs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/codecs.py b/pypdf/codecs.py
index d6858d095..0f2a34df1 100644
--- a/pypdf/codecs.py
+++ b/pypdf/codecs.py
@@ -155,4 +155,4 @@ def decode(self, data: bytes) -> bytes:
         """Decode data using LZW."""
         from .filters import LZWDecode
 
-        return LZWDecode._decodeb(data)
+        return LZWDecode.Decoder(data).decode()

From 5d1248d64cbbcd1dd299863d5e64a16a92e23b21 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 14:23:06 +0200
Subject: [PATCH 09/10] Make the codecs module private (pypdf/_codecs)

---
 pypdf/{codecs.py => _codecs/_codecs.py} | 2 +-
 tests/test_codecs.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename pypdf/{codecs.py => _codecs/_codecs.py} (99%)

diff --git a/pypdf/codecs.py b/pypdf/_codecs/_codecs.py
similarity index 99%
rename from pypdf/codecs.py
rename to pypdf/_codecs/_codecs.py
index 0f2a34df1..ce2659548 100644
--- a/pypdf/codecs.py
+++ b/pypdf/_codecs/_codecs.py
@@ -153,6 +153,6 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes:
 
     def decode(self, data: bytes) -> bytes:
         """Decode data using LZW."""
-        from .filters import LZWDecode
+        from ..filters import LZWDecode
 
         return LZWDecode.Decoder(data).decode()
diff --git a/tests/test_codecs.py b/tests/test_codecs.py
index f96e543a5..8b5626fed 100644
--- a/tests/test_codecs.py
+++ b/tests/test_codecs.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from pypdf.codecs import LzwCodec
+from pypdf._codecs._codecs import LzwCodec
 
 test_cases = [
     pytest.param(b"", id="Empty input"),

From 65647683cf6dc73ee8415b554ab6da59d50b7932 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Sep 2024 14:29:40 +0200
Subject: [PATCH 10/10] Test encoded value

---
 tests/test_codecs.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/test_codecs.py b/tests/test_codecs.py
index 8b5626fed..5113cca2c 100644
--- a/tests/test_codecs.py
+++ b/tests/test_codecs.py
@@ -26,3 +26,18 @@ def test_encode_decode(data):
     compressed_data = codec.encode(data)
     decoded = codec.decode(compressed_data)
     assert decoded == data
+
+
+@pytest.mark.parametrize(
+    ("plain", "expected_encoded"),
+    [
+        (b"", b"\x80@@"),
+        (b"A", b"\x80\x10` "),
+        (b"AAAAAA", b"\x80\x10`P8\x08"),
+        (b"Hello, World!", b"\x80\x12\x0c\xa6\xc3a\xbcX +\x9b\xceF\xc3 \x86\x02"),
+    ],
+)
+def test_encode_lzw(plain, expected_encoded):
+    codec = LzwCodec()
+    actual_encoded = codec.encode(plain)
+    assert actual_encoded == expected_encoded