From 8a34e3f9d0393d25310936db48e0a742b8c8367f Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 28 Sep 2024 13:13:18 +0200 Subject: [PATCH 01/10] TST: Add lzw.lzw_encode --- pypdf/lzw.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++ tests/test_lzw.py | 24 +++++++++++ 2 files changed, 127 insertions(+) create mode 100644 pypdf/lzw.py create mode 100644 tests/test_lzw.py diff --git a/pypdf/lzw.py b/pypdf/lzw.py new file mode 100644 index 000000000..43b7c8d1b --- /dev/null +++ b/pypdf/lzw.py @@ -0,0 +1,103 @@ +"""Lempel-Ziv-Welch (LZW) adaptive compression method.""" + +from typing import List + +CLEAR_TABLE_MARKER = 256 +EOD_MARKER = 257 + +# Data encoded using the LZW compression method shall consist of +# a sequence of codes that are 9 to 12 bits long +MAX_CODE_WIDTH = 12 + + +def lzw_encode(data: bytes) -> bytes: + """ + Encode byte data with LZW compression. + + Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding". + """ + max_table_size = 1 << MAX_CODE_WIDTH # 4096 + + # the 258 fixed codes + table = {bytes([i]): i for i in range(256)} + + next_code = 258 + result_codes = [] + + # The encoder shall begin by issuing a clear-table code + result_codes.append(CLEAR_TABLE_MARKER) + + string = b"" + for int_character in data: + character = bytes([int_character]) + if string + character in table: + # Accumulate a sequence of one or more input characters + # matching a sequence already present in the table. + # For maximum compression, the encoder looks for the longest + # such sequence. + string += character + else: + # Emit the code corresponding to that sequence. + result_codes.append(table[string]) + + # Before adding a new entry, check if the table is full + if len(table) >= max_table_size: + # Table is full, emit clear-table code and reset + result_codes.append(CLEAR_TABLE_MARKER) + table = {bytes([i]): i for i in range(256)} + next_code = 258 + # bits_per_code will be reset in pack_codes_into_bytes + else: + # Add new sequence to the table + table[string + character] = next_code + next_code += 1 + + string = character + + # Ensure everything actually is encoded + if string: + result_codes.append(table[string]) + + result_codes.append(EOD_MARKER) + + return pack_codes_into_bytes(result_codes) + + +def pack_codes_into_bytes(result_codes: List[int]) -> bytes: + """Convert the result code list into bytes.""" + bits_per_code = 9 # Initially, the code length shall be 9 bits + max_code = 1 << bits_per_code # 512 + buffer = 0 + bits_in_buffer = 0 + output = [] + + for code in result_codes: + buffer = (buffer << bits_per_code) | code + bits_in_buffer += bits_per_code + + # Codes shall be packed into a continuous bit stream, high-order bit + # first. This stream shall then be divided into bytes, high-order bit + # first. + while bits_in_buffer >= 8: + bits_in_buffer -= 8 + output.append((buffer >> bits_in_buffer) & 0xFF) + + # Handle bits_per_code reset after clear-table code + if code == CLEAR_TABLE_MARKER: + bits_per_code = 9 + max_code = 1 << bits_per_code + continue + + # Whenever both the encoder and the decoder independently (but + # synchronously) realize that the current code length is no longer + # sufficient to represent the number of entries in the table, they shall + # increase the number of bits per code by 1. + if code >= max_code - 1 and bits_per_code < MAX_CODE_WIDTH: + bits_per_code += 1 + max_code <<= 1 + + # Flush the buffer + if bits_in_buffer > 0: + output.append((buffer << (8 - bits_in_buffer)) & 0xFF) + + return bytes(output) diff --git a/tests/test_lzw.py b/tests/test_lzw.py new file mode 100644 index 000000000..565846eeb --- /dev/null +++ b/tests/test_lzw.py @@ -0,0 +1,24 @@ +"""Test LZW-related code.""" + +import pytest + +from pypdf.filters import LZWDecode +from pypdf.lzw import lzw_encode + +test_cases = [ + pytest.param(b"", id="Empty input"), # Empty input + pytest.param(b"A", id="Single character"), + pytest.param(b"AAAAAA", id="Repeating character"), + pytest.param(b"Hello, World!", id="Simple text"), + pytest.param(b"ABABABABABAB", id="Repeating pattern"), + pytest.param(b"The quick brown fox jumps over the lazy dog", id="Longer text"), + pytest.param(b"\x00\xFF\x00\xFF", id="Binary data"), +] + + +@pytest.mark.parametrize("data", test_cases) +def test_encode_decode(data): + """Decoder and encoder match.""" + compressed_data = lzw_encode(data) + decoded = LZWDecode._decodeb(compressed_data) + assert decoded == data From 609cd7a96952d0fb5354c9a1856bf7eb57d3498a Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 28 Sep 2024 14:37:04 +0200 Subject: [PATCH 02/10] Add failing test --- tests/test_lzw.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_lzw.py b/tests/test_lzw.py index 565846eeb..100219394 100644 --- a/tests/test_lzw.py +++ b/tests/test_lzw.py @@ -6,13 +6,17 @@ from pypdf.lzw import lzw_encode test_cases = [ - pytest.param(b"", id="Empty input"), # Empty input + pytest.param(b"", id="Empty input"), pytest.param(b"A", id="Single character"), pytest.param(b"AAAAAA", id="Repeating character"), pytest.param(b"Hello, World!", id="Simple text"), pytest.param(b"ABABABABABAB", id="Repeating pattern"), pytest.param(b"The quick brown fox jumps over the lazy dog", id="Longer text"), pytest.param(b"\x00\xFF\x00\xFF", id="Binary data"), + pytest.param( + b"BBBCBDBEBFBGBHBIBJBKBLBMBNBOBPBQBRBSBTBUBVBWBXBYBZB[B\\B]B^B_B`BaBbBcBdBeBfBgBhBiBjBkBlBmBnBoBpBqBrBsBtBuBvBwBxByCBCCCDCECFCGCHCICJCKCLCMCNCOCPCQCRCSCTCUCVCWCXCYCZC[C\\C]C^C_C`CaCbCcCdCeCfCgChCiCjCkClCmCnCoCpCqCrCsCtCuCvCwCxCyDBDCDDDEDFDGDHDIDJDKDLDMDNDODPDQDRDSDTDUDVDWDXDYDZD[D\\D]D^D_D`DaDbDcDdDeDfDgDhDiDjDkDlDmDnDoDpDqDrDsDtDuDvDwDxDyEBECEDEEEFEGEHEIEJEKELEMENEOEPEQERESETEUEVEWEXEYEZE[E\\E]E^E_E`EaEbEcEdEeEfEgEhEiEjEkElEmEnEoEpEqErEsEtEuEvEwExEyFBFCFDFEFFFGFHFIFJFKFLFMFNFOFPFQFRFSFTFUFVFWFXFYFZF[F\\F]F^F_F`FaFbFcFdFeFfFgFhFiFjFkFlFmFnFoFpFqFrFsFtFuFvFwFxFyGBGCGDGEGFGGGHGIGJGKGLGMGNGOGPGQGRGSGTGUGVGWGXGYGZG[G\\G]G^G_G`GaGbGcGdGeGfGgGhGiGjGkGlGmGnGoGpGqGrGsGtGuGvGwGxGyHBHCHDHEHFHGHHHIHJHKHLHMHNHOHPHQHRHSHTHUHVHWHXHYHZH[H\\H]H^H_H`HaHbHcHdHeHfHgHhHiHjHkHlHmHnHoHpHqHrHsHtHuHvHwHxHyIBICIDIEIFIGIHIIIJIKILIMINIOIPIQIRISITIUIVIWIXIYIZI[I\\I]I^I_I`IaIbIcIdIeIfIgIhIiIjIkIlImInIoIpIqIrIsItIuIvIwIxIyJBJCJDJEJFJGJHJIJJJKJLJMJNJOJPJQJRJSJTJUJVJWJXJYJZJ[J\\J]J^J_J`JaJbJcJdJeJfJgJhJiJjJkJlJmJnJoJpJqJrJsJtJuJvJwJxJyKBKCKDKEKFKGKHKIKJKKKLKMKNKOKPKQKRKSKTKUKVKWKXKYKZK[K\\K]K^K_K`KaKbKcKdKeKfKgKhKiKjKkKlKmKnKoKpKqKrKsKtKuKvKwKxKyLBLCLDLELFLGLHLILJLKLLLMLNLOLPLQLRLSLTLULVLWLXLYLZL[L\\L]L^L_L`LaLbLcLdLeLfLgLhLiLjLkLlLmLnLoLpLqLrLsLtLuLvLwLxLyMBMCMDMEMFMGMHMIMJMKMLMMMNMOMPMQMRMSMTMUMVMWMXMYMZM[M\\M]M^M_M`MaMbMcMdMeMfMgMhMiMjMkMlMmMnMoMpMqMrMsMtMuMvMwMxMyNBNCNDNENFNGNHNINJNKNLNMNNNONPNQNRNSNTNUNVNWNXNYNZN[N\\N]N^N_N`NaNbNcNdNeNfNgNhNiNjNkNlNmNnNoNpNqNrNsNtNuNvNwNxNyOBOCODOEOFOGOHOIOJOKOLOMONOOOPOQOROSOTOUOVOWOXOYOZO[O\\O]O^O_O`OaObOcOdOeOfOgOhOiOjOkOlOmOnOoOpOqOrOsOtOuOvOwOxOyPBPCPDPEPFPGPHPIPJPKPLPMPNPOPPPQPRPSPTPUPVPWPXPYPZP[P\\P]P^P_P`PaPbPcPdPePfPgPhPiPjPkPlPmPnPoPpPqPrPsPtPuPvPwPxPyQBQCQDQEQFQGQHQIQJQKQLQMQNQOQPQQQRQSQTQUQVQWQXQYQZQ[Q\\Q]Q^Q_Q`QaQbQcQdQeQfQgQhQiQjQkQlQmQnQoQpQqQrQsQtQuQvQwQxQyRBRCRDRERFRGRHRIRJRKRLRMRNRORPRQRRRSRTRURVRWRXRYRZR[R\\R]R^R_R`RaRbRcRdReRfRgRhRiRjRkRlRmRnRoRpRqRrRsRtRuRvRwRxRySBSCSDSESFSGSHSISJSKSLSMSNSOSPSQSRSSSTSUSVSWSXSYSZS[S\\S]S^S_S`SaSbScSdSeSfSgShSiSjSkSlSmSnSoSpSqSrSsStSuSvSwSxSyTBTCTDTETFTGTHTITJTKTLTMTNTOTPTQTRTSTTTUTVTWTXTYTZT[T\\T]T^T_T`TaTbTcTdTeTfTgThTiTjTkTlTmTnToTpTqTrTsTtTuTvTwTxTyUBUCUDUEUFUGUHUIUJUKULUMUNUOUPUQURUSUTUUUVUWUXUYUZU[U\\U]U^U_U`UaUbUcUdUeUfUgUhUiUjUkUlUmUnUoUpUqUrUsUtUuUvUwUxUyVBVCVDVEVFVGVHVIVJVKVLVMVNVOVPVQVRVSVTVUVVVWVXVYVZV[V\\V]V^V_V`VaVbVcVdVeVfVgVhViVjVkVlVmVnVoVpVqVrVsVtVuVvVwVxVyWBWCWDWEWFWGWHWIWJWKWLWMWNWOWPWQWRWSWTWUWVWWWXWYWZW[W\\W]W^W_W`WaWbWcWdWeWfWgWhWiWjWkWlWmWnWoWpWqWrWsWtWuWvWwWxWyXBXCXDXEXFXGXHXIXJXKXLXMXNXOXPXQXRXSXTXUXVXWXXXYXZX[X\\X]X^X_X`XaXbXcXdXeXfXgXhXiXjXkXlXmXnXoXpXqXrXsXtXuXvXwXxXyYBYCYDYEYFYGYHYIYJYKYLYMYNYOYPYQYRYSYTYUYVYWYXYYYZY[Y\\Y]Y^Y_Y`YaYbYcYdYeYfYgYhYiYjYkYlYmYnYoYpYqYrYsYtYuYvYwYxYyZBZCZDZEZFZGZHZIZJZKZLZMZNZOZPZQZRZSZTZUZVZWZXZYZZZ[Z\\Z]Z^Z_Z`ZaZbZcZdZeZfZgZhZiZjZkZlZmZnZoZpZqZrZsZtZuZvZwZxZy[B[C[D[E[F[G[H[I[J[K[L[M[N[O[P[Q[R[S[T[U[V[W[X[Y[Z[[[\\[][^[_[`[a[b[c[d[e[f[g[h[i[j[k[l[m[n[o[p[q[r[s[t[u[v[w[x[y\\B\\C\\D\\E\\F\\G\\H\\I\\J\\K\\L\\M\\N\\O\\P\\Q\\R\\S\\T\\U\\V\\W\\X\\Y\\Z\\[\\\\\\]\\^\\_\\`\\a\\b\\c\\d\\e\\f\\g\\h\\i\\j\\k\\l\\m\\n\\o\\p\\q\\r\\s\\t\\u\\v\\w\\x\\y]B]C]D]E]F]G]H]I]J]K]L]M]N]O]P]Q]R]S]T]U]V]W]X]Y]Z][]\\]]]^]_]`]a]b]c]d]e]f]g]h]i]j]k]l]m]n]o]p]q]r]s]t]u]v]w]x]y^B^C^D^E^F^G^H^I^J^K^L^M^N^O^P^Q^R^S^T^U^V^W^X^Y^Z^[^\\^]^^^_^`^a^b^c^d^e^f^g^h^i^j^k^l^m^n^o^p^q^r^s^t^u^v^w^x^y_B_C_D_E_F_G_H_I_J_K_L_M_N_O_P_Q_R_S_T_U_V_W_X_Y_Z_[_\\_]_^___`_a_b_c_d_e_f_g_h_i_j_k_l_m_n_o_p_q_r_s_t_u_v_w_x_y`B`C`D`E`F`G`H`I`J`K`L`M`N`O`P`Q`R`S`T`U`V`W`X`Y`Z`[`\\`]`^`_```a`b`c`d`e`f`g`h`i`j`k`l`m`n`o`p`q`r`s`t`u`v`w`x`yaBaCaDaEaFaGaHaIaJaKaLaMaNaOaPaQaRaSaTaUaVaWaXaYaZa[a\\a]a^a_a`aaabacadaeafagahaiajakalamanaoapaqarasatauavawaxaybBbCbDbEbFbGbHbIbJbKbLbMbNbObPbQbRbSbTbUbVbWbXbYbZb[b\\b]b^b_b`babbbcbdbebfbgbhbibjbkblbmbnbobpbqbrbsbtbubvbwbxbycBcCcDcEcFcGcHcIcJcKcLcMcNcOcPcQcRcScTcUcVcWcXcYcZc[c\\c]c^c_c`cacbcccdcecfcgchcicjckclcmcncocpcqcrcsctcucvcwcxcydBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdyeBeCeDeEeFeGeHeIeJeKeLeMeNeOePeQeReSeTeUeVeWeXeYeZe[e\\e]e^e_e`eaebecedeeefegeheiejekelemeneoepeqereseteuevewexeyfBfCfDfEfFfGfHfIfJfKfLfMfNfOfPfQfRfSfTfUfVfWfXfYfZf[f\\f]f^f_f`fafbfcfdfefffgfhfifjfkflfmfnfofpfqfrfsftfufvfwfxfygBgCgDgEgFgGgHgIgJgKgLgMgNgOgPgQgRgSgTgUgVgWgXgYgZg[g\\g]g^g_g`gagbgcgdgegfggghgigjgkglgmgngogpgqgrgsgtgugvgwgxgyhBhChDhEhFhGhHhIhJhKhLhMhNhOhPhQhRhShThUhVhWhXhYhZh[h\\h]h^h_h`hahbhchdhehfhghhhihjhkhlhmhnhohphqhrhshthuhvhwhxhyiBiCiDiEiFiGiHiIiJiKiLiMiNiOiPiQiRiSiTiUiViWiXiYiZi[i\\i]i^i_i`iaibicidieifigihiiijikiliminioipiqirisitiuiviwixiyjBjCjDjEjFjGjHjIjJjKjLjMjNjOjPjQjRjSjTjUjVjWjXjYjZj[j\\j]j^j_j`jajbjcjdjejfjgjhjijjjkjljmjnjojpjqjrjsjtjujvjwjxjykBkCkDkEkFkGkHkIkJkKkLkMkNkOkPkQkRkSkTkUkVkWkXkYkZk[k\\k]k^k_k`kakbkckdkekfkgkhkikjkkklkmknkokpkqkrksktkukvkwkxkylBlClDlElFlGlHlIlJlKlLlMlNlOlPlQlRlSlTlUlVlWlXlYlZl[l\\l]l^l_l`lalblcldlelflglhliljlklllmlnlolplqlrlsltlulvlwlxlymBmCmDmEmFmGmHmImJmKmLmMmNmOmPmQmRmSmTmUmVmWmXmYmZm[m\\m]m^m_m`mambmcmdmemfmgmhmimjmkmlmmmnmompmqmrmsmtmumvmwmxmynBnCnDnEnFnGnHnInJnKnLnMnNnOnPnQnRnSnTnUnVnWnXnYnZn[n\\n]n^n_n`nanbncndnenfngnhninjnknlnmnnnonpnqnrnsntnunvnwnxnyoBoCoDoEoFoGoHoIoJoKoLoMoNoOoPoQoRoSoToUoVoWoXoYoZo[o\\o]o^o_o`oaobocodoeofogohoiojokolomonooopoqorosotouovowoxoypBpCpDpEpFpGpHpIpJpKpLpMpNpOpPpQpRpSpTpUpVpWpXpYpZp[p\\p]p^p_p`papbpcpdpepfpgphpipjpkplpmpnpopppqprpsptpupvpwpxpyqBqCqDqEqFqGqHqIqJqKqLqMqNqOqPqQqRqSqTqUqVqWqXqYqZq[q\\q]q^q_q`qaqbqcqdqeqfqgqhqiqjqkqlqmqnqoqpqqqrqsqtquqvqwqxqyrBrCrDrErFrGrHrIrJrKrLrMrNrOrPrQrRrSrTrUrVrWrXrYrZr[r\\r]r^r_r`rarbrcrdrerfrgrhrirjrkrlrmrnrorprqrrrsrtrurvrwrxrysBsCsDsEsFsGsHsIsJsKsLsMsNsOsPsQsRsSsTsUsVsWsXsYsZs[s\\s]s^s_s`sasbscsdsesfsgshsisjskslsmsnsospsqsrssstsusvswsxsytBtCtDtEtFtGtHtItJtKtLtMtNtOtPtQtRtStTtUtVtWtXtYtZt[t\\t]t^t_t`tatbtctdtetftgthtitjtktltmtntotptqtrtstttutvtwtxtyuBuCuDuEuFuGuHuIuJuKuLuMuNuOuPuQuRuSuTuUuVuWuXuYuZu[u\\u]u^u_u`uaubucudueufuguhuiujukulumunuoupuqurusutuuuvuwuxuyvBvCvDvEvFvGvHvIvJvKvLvMvNvOvPvQvRvSvTvUvVvWvXvYvZv[v\\v]v^v_v`vavbvcvdvevfvgvhvivjvkvlvmvnvovpvqvrvsvtvuvvvwvxvywBwCwDwEwFwGwHwIwJwKwLwMwNwOwPwQwRwSwTwUwVwWwXwYwZw[w\\w]w^w_w`wawbwcwdwewfwgwhwiwjwkwlwmwnwowpwqwrwswtwuwvwwwxwyxBxCxDxExFxGxHxIxJxKxLxMxNxOxPxQxRxSxTxUxVxWxXxYxZx[x\\x]x^x_x`xaxbxcxdxexfxgxhxixjxkxlxmxnxoxpxqxrxsxtxuxvxwxxxyyByCyDyEyFyGyHyIyJyKyLyMyNyOyPyQyRySyTyUyVyWyXyYyZy[y\\y]y^y_y`yaybycydyeyfygyhyiyjykylymynyoypyqyrysytyuyvywyxyyBBBBBCBBDBBEBBFBBGBBHBBIBBJBBKBBLBBMBBNBBOBBPBBQBBRBBSBBTBBUBBVBBWBBXBBYBBZBB[BB\\BB]BB^BB_BB`BBaBBbBBcBBdBBeBBfBBgBBhBBiBBjBBkBBlBBmBBnBBoBBpBBqBBrBBsBBtBBuBBvBBwBBxBByBCBBCCBCDBCEBCFBCGBCHBCIBCJBCKBCLBCMBCNBCOBCPBCQBCRBCSBCTBCUBCVBCWBCXBCYBCZBC[BC\\BC]BC^BC_BC`BCaBCbBCcBCdBCeBCfBCgBChBCiBCjBCkBClBCmBCnBCoBCpBCqBCrBCsBCtBCuBCvBCwBCxBCyBDBBDCBDDBDEBDFBDGBDHBDIBDJBDKBDLBDMBDNBDOBDPBDQBDRBDSBDTBDUBDVBDWBDXBDYBDZBD[BD\\BD]BD^BD_BD`BDaBDbBDcBDdBDeBDfBDgBDhBDiBDjBDkBDlBDmBDnBDoBDpBDqBDrBDsBDtBDuBDvBDwBDxBDyBEBBECBEDBEEBEFBEGBEHBEIBEJBEKBELBEMBENBEOBEPBEQBERBESBETBEUBEVBEWBEXBEYBEZBE[BE\\BE]BE^BE_BE`BEaBEbBEcBEdBEeBEfBEgBEhBEiBEjBEkBElBEmBEnBEoBEpBEqBErBEsBEtBEuBEvBEwBExBEyBFBBFCBFDBFEBFFBFGBFHBFIBFJBFKBFLBFMBFNBFOBFPBFQBFRBFSBFTBFUBFVBFWBFXBFYBFZBF[BF\\BF]BF^BF_BF`BFaBFbBFcBFdBFeBFfBFgBFhBFiBFjBFkBFlBFmBFnBFoBFpBFqBFrBFsBFtBFuBFvBFwBFxBFyBGBBGCBGDBGEBGFBGGBGHBGIBGJBGKBGLBGMBGNBGOBGPBGQBGRBGSBGTBGUBGVBGWBGXBGYBGZBG[BG\\BG]BG^BG_BG`BGaBGbBGcBGdBGeBGfBGgBGhBGiBGjBGkBGlBGmBGnBGoBGpBGqBGrBGsBGtBGuBGvBGwBGxBGyBHBBHCBHDBHEBHFBHGBHHBHIBHJBHKBHLBHMBHNBHOBHPBHQBHRBHSBHTBHUBHVBHWBHXBHYBHZBH[BH\\BH]BH^BH_BH`BHaBHbBHcBHdBHeBHfBHgBHhBHiBHjBHkBHlBHmBHnBHoBHpBHqBHrBHsBHtBHuBHvBHwBHxBHyBIBBICBIDBIEBIFBIGBIHBIIBIJBIKBILBIMBINBIOBIPBIQBIRBISBITBIUBIVBIWBIXBIYBIZBI[BI\\BI]BI^BI_BI`BIaBIbBIcBIdBIeBIfBIgBIhBIiBIjBIkBIlBImBInBIoBIpBIqBIrBIsBItBIuBIvBIwBIxBIyBJBBJCBJDBJEBJFBJGBJHBJIBJJBJKBJLBJMBJNBJOBJPBJQBJRBJSBJTBJUBJVBJWBJXBJYBJZBJ[BJ\\BJ]BJ^BJ_BJ`BJaBJbBJcBJdBJeBJfBJgBJhBJiBJjBJkBJlBJmBJnBJoBJpBJqBJrBJsBJtBJuBJvBJwBJxBJyBKBBKCBKDBKEBKFBKGBKHBKIBKJBKKBKLBKMBKNBKOBKPBKQBKRBKSBKTBKUBKVBKWBKXBKYBKZBK[BK\\BK]BK^BK_BK`BKaBKbBKcBKdBKeBKfBKgBKhBKiBKjBKkBKlBKmBKnBKoBKpBKqBKrBKsBKtBKuBKvBKwBKxBKyBLBBLCBLDBLEBLFBLGBLHBLIBLJBLKBLLBLMBLNBLOBLPBLQBLRBLSBLTBLUBLVBLWBLXBLYBLZBL[BL\\BL]BL^BL_BL`BLaBLbBLcBLdBLeBLfBLgBLhBLiBLjBLkBLlBLmBLnBLoBLpBLqBLrBLsBLtBLuBLvBLwBLxBLyBMBBMCBMDBMEBMFBMGBMHBMIBMJBMKBMLBMMBMNBMOBMPBMQBMRBMSBMTBMUBMVBMWBMXBMYBMZBM[BM\\BM]BM^BM_BM`BMaBMbBMcBMdBMeBMfBMgBMhBMiBMjBMkBMlBMmBMnBMoBMpBMqBMrBMsBMtBMuBMvBMwBMxBMyBNBBNCBNDBNEBNFBNGBNHBNIBNJBNKBNLBNMBNNBNOBNPBNQBNRBNSBNTBNUBNVBNWBNXBNYBNZBN[BN\\BN]BN^BN_", + id="Table overflow", + ), ] @@ -21,4 +25,9 @@ def test_encode_decode(data): """Decoder and encoder match.""" compressed_data = lzw_encode(data) decoded = LZWDecode._decodeb(compressed_data) + msg = f"len(data)={len(data)}, len(decoded)={len(decoded)}, len(compressed_data)={len(compressed_data)}" + if decoded != data: + for i in range(len(data)): + assert decoded[i] == data[i], msg + assert decoded == data From 32c18bca701b740a5492f3b6604eba25d3d874ba Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 10:00:35 +0200 Subject: [PATCH 03/10] Use Codec-class --- pypdf/lzw.py | 200 ++++++++++++++++++++++++++------------------------- 1 file changed, 102 insertions(+), 98 deletions(-) diff --git a/pypdf/lzw.py b/pypdf/lzw.py index 43b7c8d1b..44b0fe817 100644 --- a/pypdf/lzw.py +++ b/pypdf/lzw.py @@ -2,102 +2,106 @@ from typing import List -CLEAR_TABLE_MARKER = 256 -EOD_MARKER = 257 - -# Data encoded using the LZW compression method shall consist of -# a sequence of codes that are 9 to 12 bits long -MAX_CODE_WIDTH = 12 - - -def lzw_encode(data: bytes) -> bytes: - """ - Encode byte data with LZW compression. - - Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding". - """ - max_table_size = 1 << MAX_CODE_WIDTH # 4096 - - # the 258 fixed codes - table = {bytes([i]): i for i in range(256)} - - next_code = 258 - result_codes = [] - - # The encoder shall begin by issuing a clear-table code - result_codes.append(CLEAR_TABLE_MARKER) - - string = b"" - for int_character in data: - character = bytes([int_character]) - if string + character in table: - # Accumulate a sequence of one or more input characters - # matching a sequence already present in the table. - # For maximum compression, the encoder looks for the longest - # such sequence. - string += character - else: - # Emit the code corresponding to that sequence. - result_codes.append(table[string]) - - # Before adding a new entry, check if the table is full - if len(table) >= max_table_size: - # Table is full, emit clear-table code and reset - result_codes.append(CLEAR_TABLE_MARKER) - table = {bytes([i]): i for i in range(256)} - next_code = 258 - # bits_per_code will be reset in pack_codes_into_bytes + +class LzwCodec: + CLEAR_TABLE_MARKER = 256 + EOD_MARKER = 257 + + # Data encoded using the LZW compression method shall consist of + # a sequence of codes that are 9 to 12 bits long + MAX_CODE_WIDTH = 12 + + def __init__(self) -> None: + self.clear_table() + + def clear_table(self) -> None: + """Clear the table.""" + # the 258 fixed codes + self.table = {bytes([i]): i for i in range(256)} + self.next_code = 258 + + def encode(self, data: bytes) -> bytes: + """ + Encode byte data with LZW compression. + + Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding". + """ + max_table_size = 1 << self.MAX_CODE_WIDTH # 4096 + + result_codes = [] + + # The encoder shall begin by issuing a clear-table code + result_codes.append(self.CLEAR_TABLE_MARKER) + + string = b"" + for int_character in data: + character = bytes([int_character]) + if string + character in self.table: + # Accumulate a sequence of one or more input characters + # matching a sequence already present in the table. + # For maximum compression, the encoder looks for the longest + # such sequence. + string += character else: - # Add new sequence to the table - table[string + character] = next_code - next_code += 1 - - string = character - - # Ensure everything actually is encoded - if string: - result_codes.append(table[string]) - - result_codes.append(EOD_MARKER) - - return pack_codes_into_bytes(result_codes) - - -def pack_codes_into_bytes(result_codes: List[int]) -> bytes: - """Convert the result code list into bytes.""" - bits_per_code = 9 # Initially, the code length shall be 9 bits - max_code = 1 << bits_per_code # 512 - buffer = 0 - bits_in_buffer = 0 - output = [] - - for code in result_codes: - buffer = (buffer << bits_per_code) | code - bits_in_buffer += bits_per_code - - # Codes shall be packed into a continuous bit stream, high-order bit - # first. This stream shall then be divided into bytes, high-order bit - # first. - while bits_in_buffer >= 8: - bits_in_buffer -= 8 - output.append((buffer >> bits_in_buffer) & 0xFF) - - # Handle bits_per_code reset after clear-table code - if code == CLEAR_TABLE_MARKER: - bits_per_code = 9 - max_code = 1 << bits_per_code - continue - - # Whenever both the encoder and the decoder independently (but - # synchronously) realize that the current code length is no longer - # sufficient to represent the number of entries in the table, they shall - # increase the number of bits per code by 1. - if code >= max_code - 1 and bits_per_code < MAX_CODE_WIDTH: - bits_per_code += 1 - max_code <<= 1 - - # Flush the buffer - if bits_in_buffer > 0: - output.append((buffer << (8 - bits_in_buffer)) & 0xFF) - - return bytes(output) + # Emit the code corresponding to that sequence. + result_codes.append(self.table[string]) + + # Before adding a new entry, check if the table is full + if len(self.table) >= max_table_size: + # Table is full, emit clear-table code and reset + result_codes.append(self.CLEAR_TABLE_MARKER) + self.clear_table() + # bits_per_code will be reset in pack_codes_into_bytes + else: + # Add new sequence to the table .. + self.table[string + character] = self.next_code + self.next_code += 1 + + string = character + + # Ensure everything actually is encoded + if string: + result_codes.append(self.table[string]) + + result_codes.append(self.EOD_MARKER) + + return self.pack_codes_into_bytes(result_codes) + + def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes: + """Convert the result code list into bytes.""" + bits_per_code = 9 # Initially, the code length shall be 9 bits + max_code = 1 << bits_per_code # 512 + buffer = 0 + bits_in_buffer = 0 + output = [] + + for code in result_codes: + buffer = (buffer << bits_per_code) | code + bits_in_buffer += bits_per_code + + # Codes shall be packed into a continuous bit stream, high-order bit + # first. This stream shall then be divided into bytes, high-order bit + # first. + while bits_in_buffer >= 8: + bits_in_buffer -= 8 + output.append((buffer >> bits_in_buffer) & 0xFF) + + # Handle bits_per_code reset after clear-table code + if code == self.CLEAR_TABLE_MARKER: + bits_per_code = 9 + max_code = 1 << bits_per_code + continue + + # Whenever both the encoder and the decoder independently (but + # synchronously) realize that the current code length is no longer + # sufficient to represent the number of entries in the table, they shall + # increase the number of bits per code by 1. + if code >= max_code - 1 and bits_per_code < self.MAX_CODE_WIDTH: + bits_per_code += 1 + max_code <<= 1 + + # Flush the buffer + if bits_in_buffer > 0: + output.append((buffer << (8 - bits_in_buffer)) & 0xFF) + + return bytes(output) From fa6dfb3d4cb0e259d26eb088f80b7cd3d70182a3 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 10:12:13 +0200 Subject: [PATCH 04/10] Refinements --- pypdf/lzw.py | 75 ++++++++++++++++++++++------------------------- tests/test_lzw.py | 5 ++-- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/pypdf/lzw.py b/pypdf/lzw.py index 44b0fe817..e5b9eb042 100644 --- a/pypdf/lzw.py +++ b/pypdf/lzw.py @@ -4,75 +4,73 @@ class LzwCodec: - CLEAR_TABLE_MARKER = 256 - EOD_MARKER = 257 + """Lempel-Ziv-Welch (LZW) adaptive compression codec.""" - # Data encoded using the LZW compression method shall consist of - # a sequence of codes that are 9 to 12 bits long - MAX_CODE_WIDTH = 12 + CLEAR_TABLE_MARKER = 256 # Special code to indicate table reset + EOD_MARKER = 257 # End-of-data marker + MAX_CODE_WIDTH = 12 # Codes can range from 9 to 12 bits def __init__(self) -> None: + """Initialize codec and reset the compression table.""" self.clear_table() def clear_table(self) -> None: - """Clear the table.""" - # the 258 fixed codes + """Reset the encoding table to initial state with single-byte sequences.""" self.table = {bytes([i]): i for i in range(256)} - self.next_code = 258 + self.next_code = self.EOD_MARKER + 1 def encode(self, data: bytes) -> bytes: """ - Encode byte data with LZW compression. + Encode data using the LZW compression algorithm. Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding". """ - max_table_size = 1 << self.MAX_CODE_WIDTH # 4096 - + max_table_size = 1 << self.MAX_CODE_WIDTH # 4096 entries when fully expanded result_codes = [] # The encoder shall begin by issuing a clear-table code result_codes.append(self.CLEAR_TABLE_MARKER) - string = b"" - for int_character in data: - character = bytes([int_character]) - if string + character in self.table: - # Accumulate a sequence of one or more input characters - # matching a sequence already present in the table. - # For maximum compression, the encoder looks for the longest - # such sequence. - string += character + current_sequence = b"" + for byte in data: + next_sequence = current_sequence + bytes([byte]) + + if next_sequence in self.table: + # Extend current sequence if already in the table + current_sequence = next_sequence else: - # Emit the code corresponding to that sequence. - result_codes.append(self.table[string]) + # Output code for the current sequence + result_codes.append(self.table[current_sequence]) - # Before adding a new entry, check if the table is full + # If the table is full, emit a clear-table command if len(self.table) >= max_table_size: - # Table is full, emit clear-table code and reset result_codes.append(self.CLEAR_TABLE_MARKER) self.clear_table() - # bits_per_code will be reset in pack_codes_into_bytes else: - # Add new sequence to the table .. - self.table[string + character] = self.next_code + # Add the new sequence to the table + self.table[next_sequence] = self.next_code self.next_code += 1 - string = character + # Reset to the new character + current_sequence = bytes([byte]) # Ensure everything actually is encoded - if string: - result_codes.append(self.table[string]) + if current_sequence: + result_codes.append(self.table[current_sequence]) result_codes.append(self.EOD_MARKER) return self.pack_codes_into_bytes(result_codes) def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes: - """Convert the result code list into bytes.""" + """ + Convert the list of result codes into a continuous byte stream, with codes packed as per the current bit-width. + The bit-width starts at 9 bits and expands as needed. + """ bits_per_code = 9 # Initially, the code length shall be 9 bits max_code = 1 << bits_per_code # 512 - buffer = 0 - bits_in_buffer = 0 + buffer = 0 # Temporary storage for bits to be packed into bytes + bits_in_buffer = 0 # Number of bits currently in the buffer output = [] for code in result_codes: @@ -86,21 +84,18 @@ def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes: bits_in_buffer -= 8 output.append((buffer >> bits_in_buffer) & 0xFF) - # Handle bits_per_code reset after clear-table code + # After a clear-table marker, reset to 9-bit codes if code == self.CLEAR_TABLE_MARKER: bits_per_code = 9 max_code = 1 << bits_per_code continue - # Whenever both the encoder and the decoder independently (but - # synchronously) realize that the current code length is no longer - # sufficient to represent the number of entries in the table, they shall - # increase the number of bits per code by 1. + # Expand the code width if the next code exceeds the current range if code >= max_code - 1 and bits_per_code < self.MAX_CODE_WIDTH: bits_per_code += 1 - max_code <<= 1 + max_code <<= 1 # Double the range for the new bit-width - # Flush the buffer + # Flush any remaining bits in the buffer if bits_in_buffer > 0: output.append((buffer << (8 - bits_in_buffer)) & 0xFF) diff --git a/tests/test_lzw.py b/tests/test_lzw.py index 100219394..16398a3b8 100644 --- a/tests/test_lzw.py +++ b/tests/test_lzw.py @@ -3,7 +3,7 @@ import pytest from pypdf.filters import LZWDecode -from pypdf.lzw import lzw_encode +from pypdf.lzw import LzwCodec test_cases = [ pytest.param(b"", id="Empty input"), @@ -23,7 +23,8 @@ @pytest.mark.parametrize("data", test_cases) def test_encode_decode(data): """Decoder and encoder match.""" - compressed_data = lzw_encode(data) + codec = LzwCodec() + compressed_data = codec.encode(data) decoded = LZWDecode._decodeb(compressed_data) msg = f"len(data)={len(data)}, len(decoded)={len(decoded)}, len(compressed_data)={len(compressed_data)}" if decoded != data: From 5cc104896d6cb5669a1d626621eca8324b611c02 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 10:21:07 +0200 Subject: [PATCH 05/10] Fix off-by-one errors --- pypdf/lzw.py | 77 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/pypdf/lzw.py b/pypdf/lzw.py index e5b9eb042..133277b3b 100644 --- a/pypdf/lzw.py +++ b/pypdf/lzw.py @@ -1,6 +1,6 @@ """Lempel-Ziv-Welch (LZW) adaptive compression method.""" -from typing import List +from typing import Dict, List class LzwCodec: @@ -8,7 +8,8 @@ class LzwCodec: CLEAR_TABLE_MARKER = 256 # Special code to indicate table reset EOD_MARKER = 257 # End-of-data marker - MAX_CODE_WIDTH = 12 # Codes can range from 9 to 12 bits + INITIAL_BITS_PER_CODE = 9 # Initial code bit width + MAX_BITS_PER_CODE = 12 # Maximum code bit width def __init__(self) -> None: """Initialize codec and reset the compression table.""" @@ -16,8 +17,10 @@ def __init__(self) -> None: def clear_table(self) -> None: """Reset the encoding table to initial state with single-byte sequences.""" - self.table = {bytes([i]): i for i in range(256)} + self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)} self.next_code = self.EOD_MARKER + 1 + self.bits_per_code = self.INITIAL_BITS_PER_CODE + self.max_code_value = (1 << self.bits_per_code) - 1 def encode(self, data: bytes) -> bytes: """ @@ -25,11 +28,11 @@ def encode(self, data: bytes) -> bytes: Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding". """ - max_table_size = 1 << self.MAX_CODE_WIDTH # 4096 entries when fully expanded - result_codes = [] + result_codes: List[int] = [] # The encoder shall begin by issuing a clear-table code result_codes.append(self.CLEAR_TABLE_MARKER) + self.clear_table() current_sequence = b"" for byte in data: @@ -42,16 +45,23 @@ def encode(self, data: bytes) -> bytes: # Output code for the current sequence result_codes.append(self.table[current_sequence]) - # If the table is full, emit a clear-table command - if len(self.table) >= max_table_size: - result_codes.append(self.CLEAR_TABLE_MARKER) - self.clear_table() - else: - # Add the new sequence to the table + # Add the new sequence to the table if there's room + if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1: self.table[next_sequence] = self.next_code self.next_code += 1 + # Increase bits_per_code if necessary + if ( + self.next_code > self.max_code_value + and self.bits_per_code < self.MAX_BITS_PER_CODE + ): + self.bits_per_code += 1 + self.max_code_value = (1 << self.bits_per_code) - 1 + else: + # If the table is full, emit a clear-table command + result_codes.append(self.CLEAR_TABLE_MARKER) + self.clear_table() - # Reset to the new character + # Start new sequence current_sequence = bytes([byte]) # Ensure everything actually is encoded @@ -62,18 +72,20 @@ def encode(self, data: bytes) -> bytes: return self.pack_codes_into_bytes(result_codes) - def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes: + def pack_codes_into_bytes(self, codes: List[int]) -> bytes: """ - Convert the list of result codes into a continuous byte stream, with codes packed as per the current bit-width. + Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width. The bit-width starts at 9 bits and expands as needed. """ - bits_per_code = 9 # Initially, the code length shall be 9 bits - max_code = 1 << bits_per_code # 512 - buffer = 0 # Temporary storage for bits to be packed into bytes - bits_in_buffer = 0 # Number of bits currently in the buffer - output = [] - - for code in result_codes: + bits_per_code = self.INITIAL_BITS_PER_CODE + max_bits_per_code = self.MAX_BITS_PER_CODE + max_code_value = (1 << bits_per_code) - 1 + next_code = self.EOD_MARKER + 1 + buffer = 0 + bits_in_buffer = 0 + output = bytearray() + + for code in codes: buffer = (buffer << bits_per_code) | code bits_in_buffer += bits_per_code @@ -84,16 +96,21 @@ def pack_codes_into_bytes(self, result_codes: List[int]) -> bytes: bits_in_buffer -= 8 output.append((buffer >> bits_in_buffer) & 0xFF) - # After a clear-table marker, reset to 9-bit codes + # After a clear-table marker, reset bits_per_code and next_code if code == self.CLEAR_TABLE_MARKER: - bits_per_code = 9 - max_code = 1 << bits_per_code - continue - - # Expand the code width if the next code exceeds the current range - if code >= max_code - 1 and bits_per_code < self.MAX_CODE_WIDTH: - bits_per_code += 1 - max_code <<= 1 # Double the range for the new bit-width + bits_per_code = self.INITIAL_BITS_PER_CODE + max_code_value = (1 << bits_per_code) - 1 + next_code = self.EOD_MARKER + 1 + elif code == self.EOD_MARKER: + # Do not increment next_code for EOD_MARKER + pass + else: + # Increase next_code after processing each code (except special codes) + next_code += 1 + # Increase bits_per_code if necessary + if next_code > max_code_value and bits_per_code < max_bits_per_code: + bits_per_code += 1 + max_code_value = (1 << bits_per_code) - 1 # Flush any remaining bits in the buffer if bits_in_buffer > 0: From 59ec6b95b61838df4cb5abc06f825d5f1a7c98fc Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 10:25:57 +0200 Subject: [PATCH 06/10] Code reuse --- pypdf/lzw.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pypdf/lzw.py b/pypdf/lzw.py index 133277b3b..67054c75b 100644 --- a/pypdf/lzw.py +++ b/pypdf/lzw.py @@ -16,7 +16,7 @@ def __init__(self) -> None: self.clear_table() def clear_table(self) -> None: - """Reset the encoding table to initial state with single-byte sequences.""" + """Reset the encoding table and coding state to initial conditions.""" self.table: Dict[bytes, int] = {bytes([i]): i for i in range(256)} self.next_code = self.EOD_MARKER + 1 self.bits_per_code = self.INITIAL_BITS_PER_CODE @@ -77,17 +77,15 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes: Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width. The bit-width starts at 9 bits and expands as needed. """ - bits_per_code = self.INITIAL_BITS_PER_CODE - max_bits_per_code = self.MAX_BITS_PER_CODE - max_code_value = (1 << bits_per_code) - 1 - next_code = self.EOD_MARKER + 1 + # Reset coding state + self.clear_table() buffer = 0 bits_in_buffer = 0 output = bytearray() for code in codes: - buffer = (buffer << bits_per_code) | code - bits_in_buffer += bits_per_code + buffer = (buffer << self.bits_per_code) | code + bits_in_buffer += self.bits_per_code # Codes shall be packed into a continuous bit stream, high-order bit # first. This stream shall then be divided into bytes, high-order bit @@ -96,21 +94,22 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes: bits_in_buffer -= 8 output.append((buffer >> bits_in_buffer) & 0xFF) - # After a clear-table marker, reset bits_per_code and next_code + # After a clear-table marker, reset coding state if code == self.CLEAR_TABLE_MARKER: - bits_per_code = self.INITIAL_BITS_PER_CODE - max_code_value = (1 << bits_per_code) - 1 - next_code = self.EOD_MARKER + 1 + self.clear_table() elif code == self.EOD_MARKER: # Do not increment next_code for EOD_MARKER pass else: # Increase next_code after processing each code (except special codes) - next_code += 1 + self.next_code += 1 # Increase bits_per_code if necessary - if next_code > max_code_value and bits_per_code < max_bits_per_code: - bits_per_code += 1 - max_code_value = (1 << bits_per_code) - 1 + if ( + self.next_code > self.max_code_value + and self.bits_per_code < self.MAX_BITS_PER_CODE + ): + self.bits_per_code += 1 + self.max_code_value = (1 << self.bits_per_code) - 1 # Flush any remaining bits in the buffer if bits_in_buffer > 0: From 42a2d981297f9a2efda2bf35fe7ee32a33caa864 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 10:46:14 +0200 Subject: [PATCH 07/10] Add ABC --- pypdf/{lzw.py => codecs.py} | 44 +++++++++++++++++++++++++-- tests/{test_lzw.py => test_codecs.py} | 10 ++---- 2 files changed, 44 insertions(+), 10 deletions(-) rename pypdf/{lzw.py => codecs.py} (83%) rename tests/{test_lzw.py => test_codecs.py} (96%) diff --git a/pypdf/lzw.py b/pypdf/codecs.py similarity index 83% rename from pypdf/lzw.py rename to pypdf/codecs.py index 67054c75b..d6858d095 100644 --- a/pypdf/lzw.py +++ b/pypdf/codecs.py @@ -1,9 +1,43 @@ -"""Lempel-Ziv-Welch (LZW) adaptive compression method.""" +""" +This module is for codecs only. +While the codec implementation can contain details of the PDF specification, +the module should not do any PDF parsing. +""" + +from abc import ABC, abstractmethod from typing import Dict, List -class LzwCodec: +class Codec(ABC): + """Abstract base class for all codecs.""" + + @abstractmethod + def encode(self, data: bytes) -> bytes: + """ + Encode the input data. + + Args: + data: Data to encode. + + Returns: + Encoded data. + """ + + @abstractmethod + def decode(self, data: bytes) -> bytes: + """ + Decode the input data. + + Args: + data: Data to decode. + + Returns: + Decoded data. + """ + + +class LzwCodec(Codec): """Lempel-Ziv-Welch (LZW) adaptive compression codec.""" CLEAR_TABLE_MARKER = 256 # Special code to indicate table reset @@ -116,3 +150,9 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes: output.append((buffer << (8 - bits_in_buffer)) & 0xFF) return bytes(output) + + def decode(self, data: bytes) -> bytes: + """Decode data using LZW.""" + from .filters import LZWDecode + + return LZWDecode._decodeb(data) diff --git a/tests/test_lzw.py b/tests/test_codecs.py similarity index 96% rename from tests/test_lzw.py rename to tests/test_codecs.py index 16398a3b8..f96e543a5 100644 --- a/tests/test_lzw.py +++ b/tests/test_codecs.py @@ -2,8 +2,7 @@ import pytest -from pypdf.filters import LZWDecode -from pypdf.lzw import LzwCodec +from pypdf.codecs import LzwCodec test_cases = [ pytest.param(b"", id="Empty input"), @@ -25,10 +24,5 @@ def test_encode_decode(data): """Decoder and encoder match.""" codec = LzwCodec() compressed_data = codec.encode(data) - decoded = LZWDecode._decodeb(compressed_data) - msg = f"len(data)={len(data)}, len(decoded)={len(decoded)}, len(compressed_data)={len(compressed_data)}" - if decoded != data: - for i in range(len(data)): - assert decoded[i] == data[i], msg - + decoded = codec.decode(compressed_data) assert decoded == data From 1ab81632b0cbdba0c80cc3e79b0c1a9f98e90b21 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 12:55:18 +0200 Subject: [PATCH 08/10] pass data to init --- pypdf/codecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/codecs.py b/pypdf/codecs.py index d6858d095..0f2a34df1 100644 --- a/pypdf/codecs.py +++ b/pypdf/codecs.py @@ -155,4 +155,4 @@ def decode(self, data: bytes) -> bytes: """Decode data using LZW.""" from .filters import LZWDecode - return LZWDecode._decodeb(data) + return LZWDecode.Decoder(data).decode() From 5d1248d64cbbcd1dd299863d5e64a16a92e23b21 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 14:23:06 +0200 Subject: [PATCH 09/10] Make it private --- pypdf/{codecs.py => _codecs/_codecs.py} | 2 +- tests/test_codecs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename pypdf/{codecs.py => _codecs/_codecs.py} (99%) diff --git a/pypdf/codecs.py b/pypdf/_codecs/_codecs.py similarity index 99% rename from pypdf/codecs.py rename to pypdf/_codecs/_codecs.py index 0f2a34df1..ce2659548 100644 --- a/pypdf/codecs.py +++ b/pypdf/_codecs/_codecs.py @@ -153,6 +153,6 @@ def pack_codes_into_bytes(self, codes: List[int]) -> bytes: def decode(self, data: bytes) -> bytes: """Decode data using LZW.""" - from .filters import LZWDecode + from ..filters import LZWDecode return LZWDecode.Decoder(data).decode() diff --git a/tests/test_codecs.py b/tests/test_codecs.py index f96e543a5..8b5626fed 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -2,7 +2,7 @@ import pytest -from pypdf.codecs import LzwCodec +from pypdf._codecs._codecs import LzwCodec test_cases = [ pytest.param(b"", id="Empty input"), From 65647683cf6dc73ee8415b554ab6da59d50b7932 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Sep 2024 14:29:40 +0200 Subject: [PATCH 10/10] Test encoded value --- tests/test_codecs.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 8b5626fed..5113cca2c 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -26,3 +26,18 @@ def test_encode_decode(data): compressed_data = codec.encode(data) decoded = codec.decode(compressed_data) assert decoded == data + + +@pytest.mark.parametrize( + ("plain", "expected_encoded"), + [ + (b"", b"\x80@@"), + (b"A", b"\x80\x10` "), + (b"AAAAAA", b"\x80\x10`P8\x08"), + (b"Hello, World!", b"\x80\x12\x0c\xa6\xc3a\xbcX +\x9b\xceF\xc3 \x86\x02"), + ], +) +def test_encode_lzw(plain, expected_encoded): + codec = LzwCodec() + actual_encoded = codec.encode(plain) + assert actual_encoded == expected_encoded