From fecf1f7a1f0123a2dd0a4eb5f8db9da613a04574 Mon Sep 17 00:00:00 2001 From: Chris Griffith Date: Thu, 8 Aug 2024 14:07:26 -0500 Subject: [PATCH] Version 1.27 (#98) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adding new verbose output to command line with `-v` or `--verbose` - Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl) - Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter) - Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd) - Removing expected invalid WinZip signature --------- Co-authored-by: Nicholas Bollweg Co-authored-by: Péter Co-authored-by: Andy --- .pre-commit-config.yaml | 15 ++++++++++----- AUTHORS.rst | 5 ++++- CHANGELOG.md | 9 +++++++++ MANIFEST.in | 1 + README.rst | 4 ++-- puremagic/magic_data.json | 29 ++++++++++++++++++++++------- puremagic/main.py | 34 +++++++++++++++++++++++++++------- test/test_common_extensions.py | 17 ++++++++++++++++- 8 files changed, 91 insertions(+), 23 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ceddd41..cddb731 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,28 +24,33 @@ repos: exclude: ^test/resources/ - id: trailing-whitespace args: [--markdown-linebreak-ext=md] - exclude: ^test/resources/ + exclude: | + (?x)^( + ^test/resources/.+| + ^puremagic/magic_data.json + )$ - id: check-executables-have-shebangs - id: end-of-file-fixer exclude: ^test/resources/.+ + - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.6 + rev: v0.5.7 hooks: - id: ruff - repo: https://github.com/ambv/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.10.0' + rev: 'v1.11.1' hooks: - id: mypy - repo: https://github.com/tox-dev/pyproject-fmt - rev: 2.1.3 + rev: 2.2.1 hooks: - id: pyproject-fmt diff --git a/AUTHORS.rst b/AUTHORS.rst 
index 4462eb7..2c924a3 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -20,4 +20,7 @@ A big thank you to everyone that has helped! - Andy (NebularNerd) - Raphaël Vinot (Rafiot) - Sebastian Kreft (sk-) -- William Bonnaventure (Aztorius) +- William Bonnaventure (Aztorius) +- Nicholas Bollweg (bollwyvl) +- Péter (peterekepeter) +- mara004 diff --git a/CHANGELOG.md b/CHANGELOG.md index c0d2859..cc90e76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,15 @@ Changelog ========= +Version 1.27 +------------ + +- Adding new verbose output to command line with `-v` or `--verbose` +- Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl) +- Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter) +- Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd) +- Removing expected invalid WinZip signature + Version 1.26 ------------ diff --git a/MANIFEST.in b/MANIFEST.in index 2d954e1..63a2d20 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include puremagic/*.json +include puremagic/py.typed include LICENSE include AUTHORS.rst include CHANGELOG.md diff --git a/README.rst b/README.rst index f166110..fb562b7 100644 --- a/README.rst +++ b/README.rst @@ -35,7 +35,7 @@ Disadvantages: Compatibility ~~~~~~~~~~~~~ -- Python 3.8+ +- Python 3.7+ Using github ci to run continuous integration tests on listed platforms. 
@@ -174,7 +174,7 @@ http://www.garykessler.net/library/file_sigs.html Freedesktop.org -For use of their shared-mime-info file (even if they do use XML, blea), available at: +For use of their shared-mime-info file, available at: https://cgit.freedesktop.org/xdg/shared-mime-info/ License diff --git a/puremagic/magic_data.json b/puremagic/magic_data.json index 081a7eb..3d8aa40 100644 --- a/puremagic/magic_data.json +++ b/puremagic/magic_data.json @@ -20,7 +20,7 @@ ["", 0, ".crt", "text/plain", "X.509 Certificate"], ["", 0, ".reg", "", "Windows Registry File"], ["", 0, ".md", "text/plain", "Markdown File"], - ["", 0, ".json", "application/json", "Markdown File"], + ["", 0, ".json", "application/json", "JSON File"], ["", 0, ".rst", "text/plain", "Restructured Text File"], ["", 0, ".cfg", "text/plain", "Configuration File"], ["", 0, ".flake8", "text/plain", "Flake 8 Configuration File"], @@ -28,6 +28,7 @@ ["", 0, ".c", "text/x-csrc", "C Code File"], ["", 0, ".cc", "text/x-csrc", "C Code File"], ["", 0, ".h", "text/x-csrc", "C Header File"], + ["", 0, ".pdf", "application/pdf", "Adobe Portable Document Format file"], ["", 0, ".stl", "model/stl", "stereolithography CAD software"], ["", 0, ".srt", "application/x-subrip", "SubRip subtitles"], ["", 0, ".obj", "", "Relocatable object code"], @@ -63,9 +64,13 @@ ["", 0, ".b6i", "", "BlindWrite 6 Image File"], ["", 0, ".cl2", "", "Adaptec Easy CD/DVD Creator image file"], ["", 0, ".cl3", "", "Adaptec Easy CD/DVD Creator image file"], - ["", 0, ".cl4", "", "Adaptec Easy CD/DVD Creator image file"] + ["", 0, ".cl4", "", "Adaptec Easy CD/DVD Creator image file"], + ["", 0, ".vba", "", "Visual Basic Script"], + ["", 0, "README", "text/plain", "README File"] ], "multi-part": { + "7b22": [["227d", -2, ".json", "application/json", "JSON File"]], + "7b": [["22", -1, ".json", "application/json", "JSON File"]], "464f524d": [ ["494c424d", 8, ".iff", "image/x-ilbm", "IFF Interleaved Bitmap Image"], ["38535658", 8, ".iff", "audio/x-8svx", 
"IFF 8-Bit Sampled Voice"], @@ -515,7 +520,7 @@ ], "73696262" : [ ["72686c62", 8, ".uif", "", "MagicISO Disk Image (Encrypted)"] - ] + ] }, "footers": [ ["54525545564953494f4e2d5846494c452e00", -18, ".tga", "image/tga", "Truevision Targa Graphic file"], @@ -525,9 +530,19 @@ ["3c2f7376673e", -6, ".svg", "image/svg+xml", "Scalable Vector Graphics Image"], ["6b6f6c79", -512, ".dmg", "application/x-apple-diskimage", "MacOS X image file"], ["4e45524f", -8, ".nrg", "", "Nero Disk Image (Version 1)"], - ["4e455235", -12, ".nrg", "", "Nero Disk Image (Version 2)"] + ["4e455235", -12, ".nrg", "", "Nero Disk Image (Version 2)"] ], "headers": [ + ["595556344d504547",0, ".y4m", "video/x-yuv4mpeg", "YUV4MPEG2 video file"], + ["3c68746d6c", 0, ".html", "text/html", "HTML File"], + ["424c5545", 0, ".bvr", "", "Blue Iris Video File"], + ["2d2d2d2d2d424547494e20504b4353372d2d2d2d2d", 0, ".p7b", "", "PKCS 7 Certificate File" ], + ["7b22", 0, ".json", "application/json", "JSON File"], + ["7b", 0, ".json", "application/json", "JSON File"], + ["50755454592d557365722d4b65792d46696c65", 0, ".ppk", "", "PuTTY User Key File"], + ["2d2d2d2d20424547494e2053534832205055424c4943204b4559202d2d2d2d", 0, "", "", "SSH Public Key"], + ["2d2d2d2d424547494e", 0, "", "", "Key or Cert File"], + ["2d2d2d2d20424547494e", 0, "", "", "Key or Cert File"], ["30313233343536373839", 0, ".puremagic_multi_footer", "text/ascii", "TESTFILE"], ["ff0a", 0, ".jxl", "image/jxl", "JPEG XL image (Raw stream)"], ["0000000c4a584c200d0a870a", 0, ".jxl", "image/jxl", "JPEG XL image (ISOBMFF container)"], @@ -580,7 +595,7 @@ ["425a68", 0, ".bzip2", "application/x-bzip2", "BZIP2 Compressed Archive file"], ["664c614300000022", 0, ".flac", "audio/flac", "Free Lossless Audio Codec file"], ["434f5744", 0, ".vmdk", "application/octet-stream", "VMware Sparse Extent Image file"], - ["23204469736b2044657363726970746f7246696c65", 0, ".vmdk", "application/octet-stream", "VMware Image Descriptor File"], + 
["23204469736b2044657363726970746f7246696c65", 0, ".vmdk", "application/octet-stream", "VMware Image Descriptor File"], ["4b444d56", 0, ".vmdk", "application/octet-stream", "VMware Virtual Single Disk file"], ["e310000100000000", 0, ".info", "", "Amiga icon"], ["5468697320697320", 0, ".info", "", "GNU Info Reader file"], @@ -745,6 +760,7 @@ ["000100004d534953414d204461746162617365", 0, ".mny", "application/x-msmoney", "Microsoft Money file"], ["000100005374616e64617264204a6574204442", 0, ".mdb", "application/x-msaccess", "Microsoft Access file"], ["25504446", 0, ".pdf", "application/pdf", "Adobe Portable Document Format file"], + ["0d0a25504446", 0, ".pdf", "application/pdf", "Adobe Portable Document Format file"], ["a0461df0", 512, ".ppt", "application/vnd.ms-powerpoint", "Microsoft Office PowerPoint Presentation file"], ["cf11e0a1b11ae100", 0, ".doc", "application/msword", "Perfect Office Document file"], ["d0cf11e0a1b11ae1", 0, ".doc", "application/msword", "Microsoft Office Document file"], @@ -926,7 +942,6 @@ ["564350434830", 0, ".pch", "", "Visual C PreCompiled header"], ["554641c6d2c1", 0, ".ufa", "", "UFA compressed archive"], ["ac9ebd8f0000", 0, ".qdf", "", "Quicken data"], - ["57696e5a6970", 29152, ".zip", "application/zip", "WinZip compressed archive"], ["504147454455", 0, ".dmp", "", "Windows memory dump"], ["4d444d5093a7", 0, ".dmp", "", "Windows dump file"], ["458600000600", 0, ".qbb", "", "QuickBooks backup"], @@ -2006,7 +2021,7 @@ ["5b436c6f6e6543445d", 0, ".ccd", "", "CloneCD Control File"], ["ffffffffffffffffffffffff", 0, ".sub", "", "CloneCD Sub Channel File"], ["00ffffffffffffffffffff", 0, ".img", "", "CloneCD Image File"], - ["f7fff9fffdfffbfff6fff7fff7fff5fff8fff7fff5fff0fffcfffafffafff7fff8fff6fff7fff7fff2fff2fff8", 0, ".img", "", "CloneCD Image File"], + ["f7fff9fffdfffbfff6fff7fff7fff5fff8fff7fff5fff0fffcfffafffafff7fff8fff6fff7fff7fff2fff2fff8", 0, ".img", "", "CloneCD Image File"], ["ffffffffffffffffffffffff", 0, ".b5i", "", "BlindWrite 5 
Image File"], ["425754352053545245414d205349474e", 0, ".b5t", "", "BlindWrite 5 Stream File"], ["425754352053545245414d205349474e", 0, ".b6t", "", "BlindWrite 6 Stream File"], diff --git a/puremagic/main.py b/puremagic/main.py index a2d2416..df4cae6 100644 --- a/puremagic/main.py +++ b/puremagic/main.py @@ -21,7 +21,7 @@ from itertools import chain __author__ = "Chris Griffith" -__version__ = "1.26" +__version__ = "1.27" __all__ = [ "magic_file", "magic_string", @@ -114,6 +114,9 @@ def _max_lengths() -> tuple[int, int]: return max_header_length, max_footer_length +max_head, max_foot = _max_lengths() + + def _confidence(matches, ext=None) -> list[PureMagicWithConfidence]: """Rough confidence based on string length and file extension""" results = [] @@ -133,7 +136,7 @@ def _confidence(matches, ext=None) -> list[PureMagicWithConfidence]: if not results: raise PureError("Could not identify file") - return sorted(results, key=lambda x: (x.confidence, x.byte_match), reverse=True) + return sorted(results, key=lambda x: (x.confidence, len(x.byte_match)), reverse=True) def _identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithConfidence]: @@ -205,7 +208,6 @@ def _magic(header: bytes, footer: bytes, mime: bool, ext=None) -> str: def _file_details(filename: os.PathLike | str) -> tuple[bytes, bytes]: """Grab the start and end of the file""" - max_head, max_foot = _max_lengths() with open(filename, "rb") as fin: head = fin.read(max_head) try: @@ -218,15 +220,17 @@ def _file_details(filename: os.PathLike | str) -> tuple[bytes, bytes]: def _string_details(string): """Grab the start and end of the string""" - max_head, max_foot = _max_lengths() return string[:max_head], string[-max_foot:] def _stream_details(stream): """Grab the start and end of the stream""" - max_head, max_foot = _max_lengths() head = stream.read(max_head) - stream.seek(-max_foot, os.SEEK_END) + try: + stream.seek(-max_foot, os.SEEK_END) + except OSError: + # File is smaller than the 
max_foot size, jump to beginning + stream.seek(0) foot = stream.read() stream.seek(0) return head, foot @@ -374,6 +378,7 @@ def command_line_entry(*args): dest="mime", help="Return the mime type instead of file type", ) + parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print verbose output") parser.add_argument("files", nargs="+") args = parser.parse_args(args if args else sys.argv[1:]) @@ -385,6 +390,21 @@ def command_line_entry(*args): print(f"'{fn}' : {from_file(fn, args.mime)}") except PureError: print(f"'{fn}' : could not be Identified") + continue + if args.verbose: + matches = magic_file(fn) + print(f"Total Possible Matches: {len(matches)}") + for i, result in enumerate(matches): + if i == 0: + print("\n\tBest Match") + else: + print(f"\tAlternative Match #{i}") + print(f"\tName: {result.name}") + print(f"\tConfidence: {int(result.confidence * 100)}%") + print(f"\tExtension: {result.extension}") + print(f"\tMime Type: {result.mime_type}") + print(f"\tByte Match: {result.byte_match}") + print(f"\tOffset: {result.offset}\n") imghdr_bug_for_bug = { # Special cases where imghdr is probably incorrect. 
@@ -444,5 +464,5 @@ def what(file: os.PathLike | str | None, h: bytes | None = None, imghdr_strict: return imghdr_exts.get(ext, ext) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover command_line_entry() diff --git a/test/test_common_extensions.py b/test/test_common_extensions.py index 630598e..f167d9b 100644 --- a/test/test_common_extensions.py +++ b/test/test_common_extensions.py @@ -9,6 +9,7 @@ import puremagic LOCAL_DIR = os.path.realpath(os.path.dirname(__file__)) +RESOUCE_DIR = os.path.join(LOCAL_DIR, "resources") IMAGE_DIR = os.path.join(LOCAL_DIR, "resources", "images") VIDEO_DIR = os.path.join(LOCAL_DIR, "resources", "video") AUDIO_DIR = os.path.join(LOCAL_DIR, "resources", "audio") @@ -19,6 +20,14 @@ TGA_FILE = os.path.join(IMAGE_DIR, "test.tga") +class MockBytesIO(BytesIO): + + def seek(self, offset, whence=0): + if offset < 0: + raise OSError("Invalid seek position") + return super().seek(offset, whence) + + class TestMagic(unittest.TestCase): def setUp(self): self.mp4magic = b"\x00\x00\x00\x1c\x66\x74\x79\x70\x4d\x53\x4e\ @@ -127,6 +136,10 @@ def test_magic_stream(self): self.assertEqual(result[0].extension, ".tga") self.assertRaises(ValueError, puremagic.magic_stream, BytesIO(b"")) + def test_small_stream_error(self): + ext = puremagic.from_stream(MockBytesIO(b"#!/usr/bin/env python")) + self.assertEqual(ext, ".py") + def test_mime(self): """Identify mime type""" self.assertEqual(puremagic.from_file(TGA_FILE, True), "image/tga") @@ -171,7 +184,9 @@ def test_cmd_options(self): """Test CLI options""" from puremagic.main import command_line_entry - command_line_entry(__file__, "test.py") + command_line_entry(__file__, os.path.join(AUDIO_DIR, "test.mp3"), "-v") + command_line_entry(__file__, "DOES NOT EXIST FILE") + command_line_entry(__file__, os.path.join(RESOUCE_DIR, "fake_file"), "-v") def test_bad_magic_input(self): """Test bad magic input"""