Skip to content

Commit

Permalink
Small performance improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
cclauss committed May 21, 2024
1 parent f445e2e commit e0d70ed
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 29 deletions.
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ repos:
- id: check-docstring-first
- id: debug-statements
- id: requirements-txt-fixer
- id: fix-encoding-pragma
- id: fix-byte-order-marker
# General quality checks
- id: mixed-line-ending
Expand Down
1 change: 0 additions & 1 deletion puremagic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from puremagic.main import __version__, __author__
from puremagic.main import *
1 change: 0 additions & 1 deletion puremagic/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from puremagic.main import command_line_entry

command_line_entry()
36 changes: 17 additions & 19 deletions puremagic/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
puremagic is a pure python module that will identify a file based off its
magic numbers. It is designed to be minimalistic and inherently cross platform
Expand All @@ -14,12 +13,11 @@
"""
from __future__ import annotations

import os
import json
import binascii
from itertools import chain
import os
from binascii import unhexlify
from collections import namedtuple
from typing import Union, Tuple, List, Dict, Optional
from itertools import chain

__author__ = "Chris Griffith"
__version__ = "1.23"
Expand Down Expand Up @@ -82,15 +80,13 @@ def _magic_data(
extensions = [_create_puremagic(x) for x in data["extension_only"]]
multi_part_extensions = {}
for file_match, option_list in data["multi-part"].items():
multi_part_extensions[binascii.unhexlify(file_match.encode("ascii"))] = [
_create_puremagic(x) for x in option_list
]
multi_part_extensions[unhexlify(file_match.encode("ascii"))] = [_create_puremagic(x) for x in option_list]
return headers, footers, extensions, multi_part_extensions


def _create_puremagic(x: list) -> PureMagic:
return PureMagic(
byte_match=binascii.unhexlify(x[0].encode("ascii")),
byte_match=unhexlify(x[0].encode("ascii")),
offset=x[1],
extension=x[2],
mime_type=x[3],
Expand Down Expand Up @@ -120,15 +116,17 @@ def _confidence(matches, ext=None) -> list[PureMagicWithConfidence]:
"""Rough confidence based on string length and file extension"""
results = []
for match in matches:
con = 0.8 if len(match.byte_match) >= 9 else float("0.{0}".format(len(match.byte_match)))
con = 0.8 if len(match.byte_match) >= 9 else float(f"0.{len(match.byte_match)}")
if con >= 0.1 and ext and ext == match.extension:
con = 0.9
results.append(PureMagicWithConfidence(confidence=con, **match._asdict()))

if not results and ext:
for magic_row in extension_only_array:
if ext == magic_row.extension:
results.append(PureMagicWithConfidence(confidence=0.1, **magic_row._asdict()))
results = [
PureMagicWithConfidence(confidence=0.1, **magic_row._asdict())
for magic_row in extension_only_array
if ext == magic_row.extension
]

if not results:
raise PureError("Could not identify file")
Expand All @@ -141,7 +139,7 @@ def _identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithC

# Capture the length of the data
# That way we do not try to identify bytes that don't exist
matches = list()
matches = []
for magic_row in magic_header_array:
start = magic_row.offset
end = magic_row.offset + len(magic_row.byte_match)
Expand Down Expand Up @@ -210,7 +208,7 @@ def _file_details(filename: os.PathLike | str) -> tuple[bytes, bytes]:
head = fin.read(max_head)
try:
fin.seek(-max_foot, os.SEEK_END)
except IOError:
except OSError:
fin.seek(0)
foot = fin.read()
return head, foot
Expand Down Expand Up @@ -242,7 +240,7 @@ def ext_from_filename(filename: os.PathLike | str) -> str:
base, ext = str(filename).lower().rsplit(".", 1)
except ValueError:
return ""
ext = ".{0}".format(ext)
ext = f".{ext}"
all_exts = [x.extension for x in chain(magic_header_array, magic_footer_array)]

if base[-4:].startswith("."):
Expand Down Expand Up @@ -379,12 +377,12 @@ def command_line_entry(*args):

for fn in args.files:
if not os.path.exists(fn):
print("File '{0}' does not exist!".format(fn))
print(f"File '{fn}' does not exist!")
continue
try:
print("'{0}' : {1}".format(fn, from_file(fn, args.mime)))
print(f"'{fn}' : {from_file(fn, args.mime)}")
except PureError:
print("'{0}' : could not be Identified".format(fn))
print(f"'{fn}' : could not be Identified")


if __name__ == "__main__":
Expand Down
3 changes: 1 addition & 2 deletions scripts/parse_ftk_kessler_sigs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This is a very ugly helper script to keep up to date with file types in
Gary C. Kessler's FTK_sigs_GCK archive.
Expand Down Expand Up @@ -60,6 +59,6 @@
elif sig["SIG"] not in known_sigs:
for ext in sig["EXT_NAME"]:
if ext != "(none)":
print("\t\t{},".format(json.dumps([sig["SIG"], offset, ".{}".format(ext), "", sig["DESCRIPTION"]])))
print("\t\t{},".format(json.dumps([sig["SIG"], offset, f".{ext}", "", sig["DESCRIPTION"]])))
else:
print("\t\t{},".format(json.dumps([sig["SIG"], offset, "", "", sig["DESCRIPTION"]])))
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from setuptools import setup
import os
Expand All @@ -8,12 +7,12 @@

root = os.path.abspath(os.path.dirname(__file__))

with open(os.path.join(root, "puremagic", "main.py"), "r") as reuse_file:
with open(os.path.join(root, "puremagic", "main.py")) as reuse_file:
reuse_content = reuse_file.read()

attrs = dict(re.findall(r"__([a-z]+)__ *= *['\"](.+)['\"]", reuse_content))

with open("README.rst", "r") as readme_file:
with open("README.rst") as readme_file:
long_description = readme_file.read()

setup(
Expand Down
3 changes: 1 addition & 2 deletions test/test_common_extensions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
import unittest
from tempfile import NamedTemporaryFile
import os
Expand Down Expand Up @@ -55,7 +54,7 @@ def group_test(self, directory):
if ext_failures:
raise AssertionError(
"The following files did not have the expected extensions: {}".format(
", ".join(['"{}" expected "{}"'.format(item, ext) for item, ext in ext_failures])
", ".join([f'"{item}" expected "{ext}"' for item, ext in ext_failures])
)
)
if mime_failures:
Expand Down

0 comments on commit e0d70ed

Please sign in to comment.