Skip to content

Commit 7759f54

Browse files
jcrussell, qkaiser
authored and committed
feat(handler): add support for MSI files
Seems to work, but the parsing had to be hand-written due to issues with olefile. feat(handler): fix issues with MSI handler. Seems to work on both vanilla and padded MSI files. This could be migrated to a fully Python-based implementation in the future using: * https://github.com/nightlark/pymsi * https://github.com/decalage2/olefile. As of v0.47, olefile does not handle padded MSIs properly, so we re-implement CFBF header parsing and compute the archive size ourselves.
1 parent 9e13e9d commit 7759f54

File tree

352 files changed

+1194
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

352 files changed

+1194
-1
lines changed

docs/handlers.md

Lines changed: 18 additions & 0 deletions

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""MSI Handler.
2+
3+
Extracts MSIs using 7z with custom CFBF header parsing to compute the full
4+
archive size.
5+
"""
6+
7+
import struct
8+
from typing import Optional
9+
10+
from structlog import get_logger
11+
12+
from unblob.extractors import Command
13+
14+
from ...models import (
15+
File,
16+
HandlerDoc,
17+
HandlerType,
18+
HexString,
19+
Reference,
20+
StructHandler,
21+
ValidChunk,
22+
)
23+
24+
logger = get_logger()
25+
26+
27+
class MsiHandler(StructHandler):
    """Carve MSI installers, which are CFBF (OLE2 compound file) containers.

    The chunk size is not stored in the header, so it is derived from the
    highest sector that the FAT marks as in use. Extraction itself is
    delegated to 7z.
    """

    NAME = "msi"

    PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
    C_DEFINITIONS = r"""
        typedef struct cfbf_header
        {
            // [offset from start (bytes), length (bytes)]
            uint8 signature[8];      // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
                                     // 0x1a, 0xe1} for current version
            uint8 clsid[16];         // [08H,16] reserved must be zero (WriteClassStg/
                                     // GetClassFile uses root directory class id)
            uint16 minorVersion;     // [18H,02] minor version of the format: 33 is
                                     // written by reference implementation
            uint16 dllVersion;       // [1AH,02] major version of the dll/format: 3 for
                                     // 512-byte sectors, 4 for 4 KB sectors
            uint16 byteOrder;        // [1CH,02] 0xFFFE: indicates Intel byte-ordering
            uint16 sectorShift;      // [1EH,02] size of sectors in power-of-two;
                                     // typically 9 indicating 512-byte sectors
            uint16 miniSectorShift;  // [20H,02] size of mini-sectors in power-of-two;
                                     // typically 6 indicating 64-byte mini-sectors
            uint16 reserved;         // [22H,02] reserved, must be zero
            uint32 reserved1;        // [24H,04] reserved, must be zero
            uint32 csectDir;         // [28H,04] must be zero for 512-byte sectors,
                                     // number of SECTs in directory chain for 4 KB
                                     // sectors
            uint32 csectFat;         // [2CH,04] number of SECTs in the FAT chain
            uint32 sectDirStart;     // [30H,04] first SECT in the directory chain
            uint32 txSignature;      // [34H,04] signature used for transactions; must
                                     // be zero. The reference implementation
                                     // does not support transactions
            uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
                                     // typically 4096 bytes
            uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
            uint32 csectMiniFat;     // [40H,04] number of SECTs in the MiniFAT chain
            uint32 sectDifStart;     // [44H,04] first SECT in the DIFAT chain
            uint32 csectDif;         // [48H,04] number of SECTs in the DIFAT chain
            uint32 sectFat[109];     // [4CH,436] the SECTs of first 109 FAT sectors
        } cfbf_header_t;
    """
    HEADER_STRUCT = "cfbf_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="MSI",
        description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
        handler_type=HandlerType.ARCHIVE,
        vendor="Microsoft",
        references=[
            Reference(
                title="MSI File Format Documentation",
                url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
            ),
            Reference(
                title="Compound File Binary Format",
                url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
            ),
        ],
        limitations=[],
    )

    # Special sector numbers defined by the compound file format.
    _FREESECT = 0xFFFFFFFF  # unallocated sector / unused DIFAT slot
    _MAXREGSECT = 0xFFFFFFFA  # highest value that is a real sector number

    # Value of the byteOrder header field for a well-formed file.
    _BYTE_ORDER_LITTLE_ENDIAN = 0xFFFE

    # The format only defines shifts 9 (512 B) and 12 (4 KB). The lower bound
    # guarantees the 512-byte header fits in one sector; the upper bound is a
    # deliberately generous cap that keeps per-sector reads small when the
    # magic matched random data.
    _MIN_SECTOR_SHIFT = 9
    _MAX_SECTOR_SHIFT = 16

    def _read_sector_entries(
        self, file: File, start_offset: int, sector_size: int, sector_id: int
    ) -> Optional[tuple]:
        """Read one sector and decode it as little-endian uint32 entries.

        Returns None when ``sector_id`` is not a regular sector number or the
        sector cannot be read in full (truncated or corrupt input).
        """
        if sector_id > self._MAXREGSECT:
            return None

        # The header occupies the whole first sector-sized slot, so sector #0
        # starts one full sector after the beginning of the file. For 512-byte
        # sectors this is the same as "512 + id * 512".
        sector_offset = start_offset + (sector_id + 1) * sector_size
        try:
            file.seek(sector_offset)
            raw_sector = file.read(sector_size)
        except (ValueError, OSError):
            # Seeking outside the underlying buffer: treat as truncated input.
            return None

        if len(raw_sector) != sector_size:
            return None

        return struct.unpack(f"<{sector_size // 4}I", raw_sector)

    def _collect_difat(
        self, file: File, header, start_offset: int, sector_size: int
    ) -> Optional[list]:
        """Return every DIFAT entry: the 109 header slots plus the DIFAT chain.

        Entry ``i`` of the result is the location of the FAT sector describing
        sectors ``i * entries_per_sector`` onwards (or FREESECT when unused).
        Returns None when the chain is cyclic or points at unreadable sectors.
        """
        difat = list(header.sectFat)

        next_difat = header.sectDifStart
        visited = set()
        # csectDif bounds the walk; ``visited`` additionally stops cycles so a
        # bogus count cannot make us loop over the same sectors forever.
        for _ in range(header.csectDif):
            if next_difat > self._MAXREGSECT:
                break  # ENDOFCHAIN / FREESECT: chain finished early
            if next_difat in visited:
                return None
            visited.add(next_difat)

            entries = self._read_sector_entries(
                file, start_offset, sector_size, next_difat
            )
            if entries is None:
                return None

            # The last slot of a DIFAT sector links to the next DIFAT sector;
            # all other slots are FAT sector locations.
            difat.extend(entries[:-1])
            next_difat = entries[-1]

        return difat

    def _last_used_index(self, entries: tuple) -> Optional[int]:
        """Return the index of the last non-FREESECT entry, or None if all free."""
        for index in range(len(entries) - 1, -1, -1):
            if entries[index] != self._FREESECT:
                return index
        return None

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the compound file starting at ``start_offset``.

        The size is derived from the highest sector number that any FAT sector
        marks as allocated: header sector + sectors 0..max.

        Returns a ValidChunk on success, or None when the header is
        implausible or the FAT/DIFAT structures cannot be read.
        """
        file.seek(start_offset)
        header = self.parse_header(file)

        # Reject false-positive magic matches before sizing any reads from
        # attacker/garbage-controlled fields.
        if header.byteOrder != self._BYTE_ORDER_LITTLE_ENDIAN:
            return None
        if not self._MIN_SECTOR_SHIFT <= header.sectorShift <= self._MAX_SECTOR_SHIFT:
            return None

        sector_size = 1 << header.sectorShift
        entries_per_sector = sector_size // 4

        difat = self._collect_difat(file, header, start_offset, sector_size)
        if difat is None:
            return None

        max_used_sector = 0
        for fat_index, fat_sector in enumerate(difat):
            # Unused DIFAT slot
            if fat_sector == self._FREESECT:
                continue

            entries = self._read_sector_entries(
                file, start_offset, sector_size, fat_sector
            )
            if entries is None:
                return None

            # Only the last allocated entry of each FAT sector can raise the
            # maximum, so scan from the end and stop at the first hit.
            last_used = self._last_used_index(entries)
            if last_used is not None:
                max_used_sector = max(
                    max_used_sector, fat_index * entries_per_sector + last_used
                )

        # One sector-sized slot for the header, plus sectors 0..max_used_sector.
        total_size = (max_used_sector + 2) * sector_size

        return ValidChunk(
            start_offset=start_offset,
            end_offset=start_offset + total_size,
        )

python/unblob/processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
57+
# Disabled for MSI files
58+
# "Composite Document File V2 Document",
5859
"Erlang BEAM file",
5960
"GIF",
6061
"GNU message catalog",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:dce9e456ace76b969fe0fe4d228bf096662c11d2376d99a9210f6364428a94c4
3+
size 1563648
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:da8f4120ab4ffacb19067a26f6a8b2695e00ec19bcc48ff694349c62df1b330b
3+
size 1563680
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa8e5036d973688f1e8622fbe9ab22e037346e0def0197bf5e7cdf37da4e223d
3+
size 3831808
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:12c87c542e1d4a39b47f176ffa5fd1691c98e5f9d502e6e46573962fb77c4510
3+
size 3831840
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16

0 commit comments

Comments
 (0)