Skip to content

Generate labels JSON from the dataset's folders

Alex edited this page Apr 11, 2022 · 1 revision

The following script, built with Tinyscript, achieves this:

#!/usr/bin/env python
from tinyscript import *

# Maps the name of a sample's parent folder to its label. Folders not listed
# here get a label derived from their own name (see folder_label). A None
# label ends up as null in the output JSON.
PATTERNS = {'not-packed': None, 'BeRoEXEPacker': "bero"}


def folder_label(folder):
    """Return the label associated with a dataset folder name.

    Names listed in PATTERNS map to their configured label (possibly None).
    Any other name is lowercased and every whitespace character or dash in it
    is replaced by an underscore.
    """
    return PATTERNS.get(folder, re.sub(r"[\s-]", "_", folder.lower()))


if __name__ == '__main__':
    # Local import keeps this block self-sufficient regardless of what the
    # star import above exposes.
    import os

    parser.add_argument("dataset", help="dataset whose labels are to be computed")
    parser.add_argument("-e", "--exclude", nargs="*", default=[".git"], action="extend", help="excluded folders")
    parser.add_argument("-o", "--output", default="labels.json", help="output labels JSON file")
    initialize()
    # The output file may be located inside the dataset itself (e.g. when the
    # script is run from the dataset's root with the default output name); it
    # must not be hashed and labelled as a sample on a subsequent run.
    output_path = os.path.abspath(args.output)
    # labels: SHA256 digest -> label ; files: SHA256 digest -> first path seen
    labels, files = {}, {}
    for f in ts.Path(args.dataset).walk():
        # Skip anything under an excluded folder, non-files and ancillary
        # .ini/.md files.
        if any(x in args.exclude for x in f.parts) or not ts.is_file(f) or f.extension in [".ini", ".md"]:
            continue
        sample_path = os.path.abspath(str(f))
        if sample_path == output_path:
            continue
        logger.info("Processing %s", f)
        # The label comes from the immediate parent folder. It is taken from
        # the absolute path so that a single-component relative path (a file
        # directly under "." -- presumably what walk() yields in that case)
        # cannot raise an IndexError.
        label = folder_label(os.path.basename(os.path.dirname(sample_path)))
        h = hashlib.sha256_file(f)
        if h in labels:
            # Same content already seen: keep the first occurrence and delete
            # this one from the dataset.
            logger.warning("Duplicate: %s (of %s)", f, files[h])
            f.remove()
        else:
            labels[h], files[h] = label, str(f)
    # Distinct name for the file handle so the loop variable is not shadowed.
    with open(args.output, 'w') as out:
        json.dump(labels, out, indent=4)

Usage (from the repository's root): `python3 labels-generator.py .`