Skip to content

Commit bbb58a2

Browse files
jcrussellqkaiser
authored andcommitted
feat(handler): add support for MSI files
Extracts MSIs using 7z with custom CFBF header parsing to compute the full archive size. Works on both vanilla and padded MSI files. This could be migrated to a fully Python-based implementation in the future using: * https://github.com/nightlark/pymsi * https://github.com/decalage2/olefile As of v0.47, olefile does not handle padded MSIs properly so we re-implement CFBF header parsing and compute the archive size ourselves. Implement a complete Compound File FAT traversal: parse header, collect every FAT sector by following the DIFAT chain, read each FAT sector, and locate the highest allocated sector so we can compute MSI chunk size even for large archives exceeding the 109 header FAT entries.
1 parent 9e13e9d commit bbb58a2

File tree

353 files changed

+1324
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

353 files changed

+1324
-1
lines changed

docs/handlers.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
| [`LZIP`](#lzip) | COMPRESSION | :octicons-check-16: |
4141
| [`LZMA`](#lzma) | COMPRESSION | :octicons-check-16: |
4242
| [`LZO`](#lzo) | COMPRESSION | :octicons-check-16: |
43+
| [`MSI`](#msi) | ARCHIVE | :octicons-alert-fill-12: |
4344
| [`MULTI-SEVENZIP`](#multi-sevenzip) | ARCHIVE | :octicons-check-16: |
4445
| [`NETGEAR CHK`](#netgear-chk) | ARCHIVE | :octicons-check-16: |
4546
| [`NETGEAR TRX V1`](#netgear-trx-v1) | ARCHIVE | :octicons-check-16: |
@@ -718,6 +719,28 @@
718719

719720
- [LZO File Format Documentation](http://www.lzop.org/){ target="_blank" }
720721
- [LZO Wikipedia](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer){ target="_blank" }
722+
## MSI
723+
724+
!!! warning "Partially supported"
725+
726+
=== "Description"
727+
728+
Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.
729+
730+
---
731+
732+
- **Handler type:** Archive
733+
- **Vendor:** Microsoft
734+
735+
=== "References"
736+
737+
- [MSI File Format Documentation](https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer){ target="_blank" }
738+
- [Compound File Binary Format](https://en.wikipedia.org/wiki/Compound_File_Binary_Format){ target="_blank" }
739+
740+
=== "Limitations"
741+
742+
- Limited to CFB based extraction, not full-on MSI extraction
743+
- Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer
721744
## multi-sevenzip
722745

723746
!!! success "Fully supported"

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import io
2+
import struct
3+
from typing import List, Optional
4+
5+
from structlog import get_logger
6+
7+
from unblob.extractors import Command
8+
9+
from ...file_utils import InvalidInputFormat
10+
from ...models import (
11+
File,
12+
HandlerDoc,
13+
HandlerType,
14+
HexString,
15+
Reference,
16+
StructHandler,
17+
ValidChunk,
18+
)
19+
20+
FREE_SECTOR = 0xFFFFFFFF
21+
END_OF_CHAIN = 0xFFFFFFFE
22+
HEADER_SIZE = 512
23+
24+
logger = get_logger()
25+
26+
27+
class MsiHandler(StructHandler):
    """Carve Compound File Binary Format (CFBF) containers such as MSI files.

    The chunk size is derived from the CFBF header and FAT: every FAT sector
    is located (first through the header, then through the DIFAT chain), and
    the highest non-free FAT entry determines where the container ends.
    Extraction itself is delegated to ``7z``.
    """

    NAME = "msi"

    PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
    C_DEFINITIONS = r"""
        typedef struct cfbf_header
        {
            // [offset from start (bytes), length (bytes)]
            uint8 signature[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
            // 0x1a, 0xe1} for current version
            uint8 clsid[16]; // [08H,16] reserved must be zero (WriteClassStg/
            // GetClassFile uses root directory class id)
            uint16 minorVersion; // [18H,02] minor version of the format: 33 is
            // written by reference implementation
            uint16 dllVersion; // [1AH,02] major version of the dll/format: 3 for
            // 512-byte sectors, 4 for 4 KB sectors
            uint16 byteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
            uint16 sectorShift; // [1EH,02] size of sectors in power-of-two;
            // typically 9 indicating 512-byte sectors
            uint16 miniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
            // typically 6 indicating 64-byte mini-sectors
            uint16 reserved; // [22H,02] reserved, must be zero
            uint32 reserved1; // [24H,04] reserved, must be zero
            uint32 csectDir; // [28H,04] must be zero for 512-byte sectors,
            // number of SECTs in directory chain for 4 KB
            // sectors
            uint32 csectFat; // [2CH,04] number of SECTs in the FAT chain
            uint32 sectDirStart; // [30H,04] first SECT in the directory chain
            uint32 txSignature; // [34H,04] signature used for transactions; must
            // be zero. The reference implementation
            // does not support transactions
            uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
            // typically 4096 bytes
            uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
            uint32 csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
            uint32 sectDifStart; // [44H,04] first SECT in the DIFAT chain
            uint32 csectDif; // [48H,04] number of SECTs in the DIFAT chain
            uint32 sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        } cfbf_header_t;
    """
    HEADER_STRUCT = "cfbf_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="MSI",
        description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
        handler_type=HandlerType.ARCHIVE,
        vendor="Microsoft",
        references=[
            Reference(
                title="MSI File Format Documentation",
                url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
            ),
            Reference(
                title="Compound File Binary Format",
                url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
            ),
        ],
        limitations=[
            "Limited to CFB based extraction, not full-on MSI extraction",
            "Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer",
        ],
    )

    def _read_sector(
        self, file: File, start_offset: int, sector_size: int, sector_id: int
    ) -> bytes:
        """Return the raw bytes of sector ``sector_id``.

        Sector 0 begins one ``sector_size`` after ``start_offset``: the header
        occupies the first sector-sized slot of the container.

        Raises:
            InvalidInputFormat: if the sector starts past the end of ``file``
                or fewer than ``sector_size`` bytes could be read.
        """
        # All sectors, including the fixed-size header, occupy full sector_size
        sector_offset = start_offset + sector_size + sector_id * sector_size
        if sector_offset > file.size():
            raise InvalidInputFormat("Invalid MSI file, sector offset too large")

        file.seek(sector_offset, io.SEEK_SET)
        raw_sector = file.read(sector_size)
        if len(raw_sector) != sector_size:
            raise InvalidInputFormat("Invalid MSI file, sector shorter than expected")

        return raw_sector

    def _collect_fat_sectors(
        self,
        file: File,
        header,
        start_offset: int,
        sector_size: int,
        entries_per_sector: int,
    ) -> List[int]:
        """Return the ids of the FAT sectors, in FAT order.

        Ids are taken from ``header.sectFat`` first and then from the DIFAT
        chain starting at ``header.sectDifStart``, skipping ``FREE_SECTOR``
        entries, until ``header.csectFat`` ids have been gathered.

        Raises:
            InvalidInputFormat: if a DIFAT sector cannot be read, if the DIFAT
                chain revisits a sector, or if the chain ends before
                ``header.csectFat`` ids were found.
        """
        fat_sectors: List[int] = []

        def maybe_add_sector(sector_id: int) -> bool:
            # Returns True once the expected number of FAT sectors is reached.
            if sector_id == FREE_SECTOR:
                return False

            fat_sectors.append(sector_id)
            return len(fat_sectors) == header.csectFat

        for sect in header.sectFat:
            if maybe_add_sector(sect):
                return fat_sectors

        if len(fat_sectors) == header.csectFat:
            return fat_sectors

        # The remaining FAT sector ids live inside the DIFAT chain. The last
        # entry in each DIFAT sector points to the next DIFAT sector.
        difat_sector = header.sectDifStart

        # csectDif comes straight from the (untrusted) header and can be as
        # large as 2**32 - 1, so it is not a usable bound on its own: a chain
        # that loops back on itself would be re-read until the counter runs
        # out. Remember what was visited and reject loops explicitly.
        visited_difat_sectors = set()

        for _ in range(header.csectDif):
            if difat_sector in (FREE_SECTOR, END_OF_CHAIN):
                break

            if difat_sector in visited_difat_sectors:
                raise InvalidInputFormat("Invalid MSI file, DIFAT chain loops")
            visited_difat_sectors.add(difat_sector)

            raw_sector = self._read_sector(
                file, start_offset, sector_size, difat_sector
            )
            entries = struct.unpack(f"<{entries_per_sector}I", raw_sector)

            difat_sector = entries[-1]
            for entry in entries[:-1]:
                if maybe_add_sector(entry):
                    return fat_sectors

        if len(fat_sectors) != header.csectFat:
            raise InvalidInputFormat("Invalid MSI file, incomplete FAT chain")

        return fat_sectors

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the container that starts at ``start_offset``.

        The end offset is the header slot plus every sector up to and
        including the highest FAT entry that is not ``FREE_SECTOR``.

        Raises:
            InvalidInputFormat: if the sector size is smaller than the header,
                the header declares no FAT sectors, or a FAT/DIFAT sector
                cannot be read.
        """
        file.seek(start_offset, io.SEEK_SET)
        header = self.parse_header(file)

        sector_size = 2**header.sectorShift
        entries_per_sector = sector_size // 4  # FAT entries are 32-bit ids

        if sector_size < HEADER_SIZE:
            raise InvalidInputFormat("Invalid MSI file, sector smaller than header")

        if header.csectFat == 0:
            raise InvalidInputFormat("Invalid MSI file, FAT chain is empty")

        fat_sectors = self._collect_fat_sectors(
            file, header, start_offset, sector_size, entries_per_sector
        )

        max_used_sector = 0
        for sector_index, sect in enumerate(fat_sectors):
            raw_sector = self._read_sector(file, start_offset, sector_size, sect)
            entries = struct.unpack(f"<{entries_per_sector}I", raw_sector)

            # The n-th FAT sector describes sector ids starting at
            # n * entries_per_sector; scan backwards for its last used entry.
            base_sector_id = sector_index * entries_per_sector
            for entry_id in range(len(entries) - 1, -1, -1):
                if entries[entry_id] == FREE_SECTOR:
                    continue

                max_id = base_sector_id + entry_id
                max_used_sector = max(max_used_sector, max_id)
                break

        # One sector-sized slot for the header, then sectors 0..max_used_sector.
        total_size = sector_size + ((max_used_sector + 1) * sector_size)

        return ValidChunk(
            start_offset=start_offset,
            end_offset=start_offset + total_size,
        )

python/unblob/processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
57+
# Disabled for MSI files
58+
# "Composite Document File V2 Document",
5859
"Erlang BEAM file",
5960
"GIF",
6061
"GNU message catalog",

tests/handlers/archive/test_msi.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import struct
2+
3+
import pytest
4+
5+
from unblob.file_utils import File
6+
from unblob.handlers.archive.msi import (
7+
END_OF_CHAIN,
8+
FREE_SECTOR,
9+
MsiHandler,
10+
)
11+
12+
13+
def _build_msi_with_sector_shift(sector_shift: int) -> bytes:
    """Build a minimal two-sector CFBF image for the given sector shift.

    The image is a header padded to one full sector followed by a single FAT
    sector whose first entry is END_OF_CHAIN and whose other entries are
    FREE_SECTOR.
    """
    size = 1 << sector_shift
    slots = size // 4

    head = bytearray(size)
    head[:8] = bytes.fromhex("D0 CF 11 E0 A1 B1 1A E1")

    major_version = 3 if sector_shift < 12 else 4
    # Offsets and values taken from the CFBF header specification:
    # minorVersion, dllVersion, byteOrder, sectorShift, miniSectorShift, reserved
    struct.pack_into(
        "<6H", head, 0x18, 0x0033, major_version, 0xFFFE, sector_shift, 6, 0
    )

    dword_fields = (
        (0x2C, 1),  # csectFat
        (0x38, 4096),  # miniSectorCutoff
        (0x3C, FREE_SECTOR),  # sectMiniFatStart
        (0x44, FREE_SECTOR),  # sectDifStart
    )
    for offset, value in dword_fields:
        struct.pack_into("<I", head, offset, value)

    # sectFat[109]: only the first slot is used and it points at sector 0.
    struct.pack_into("<109I", head, 0x4C, 0, *([FREE_SECTOR] * 108))

    fat = bytearray(size)
    struct.pack_into(
        f"<{slots}I", fat, 0, END_OF_CHAIN, *([FREE_SECTOR] * (slots - 1))
    )

    return bytes(head + fat)
49+
50+
51+
@pytest.mark.parametrize("sector_shift", [9, 12])
def test_calculate_chunk_respects_sector_size(sector_shift: int):
    """The computed chunk must span exactly the synthetic image, whatever the sector size."""
    leading = b"prefix"
    image = _build_msi_with_sector_shift(sector_shift)
    carrier = File.from_bytes(leading + image)

    result = MsiHandler().calculate_chunk(carrier, len(leading))

    assert result is not None
    assert result.start_offset == len(leading)
    assert result.end_offset == len(leading) + len(image)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:dce9e456ace76b969fe0fe4d228bf096662c11d2376d99a9210f6364428a94c4
3+
size 1563648
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:da8f4120ab4ffacb19067a26f6a8b2695e00ec19bcc48ff694349c62df1b330b
3+
size 1563680
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa8e5036d973688f1e8622fbe9ab22e037346e0def0197bf5e7cdf37da4e223d
3+
size 3831808
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:12c87c542e1d4a39b47f176ffa5fd1691c98e5f9d502e6e46573962fb77c4510
3+
size 3831840
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16

0 commit comments

Comments
 (0)