Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
23 changes: 23 additions & 0 deletions docs/handlers.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
| [`LZIP`](#lzip) | COMPRESSION | :octicons-check-16: |
| [`LZMA`](#lzma) | COMPRESSION | :octicons-check-16: |
| [`LZO`](#lzo) | COMPRESSION | :octicons-check-16: |
| [`MSI`](#msi) | ARCHIVE | :octicons-alert-fill-12: |
| [`MULTI-SEVENZIP`](#multi-sevenzip) | ARCHIVE | :octicons-check-16: |
| [`NETGEAR CHK`](#netgear-chk) | ARCHIVE | :octicons-check-16: |
| [`NETGEAR TRX V1`](#netgear-trx-v1) | ARCHIVE | :octicons-check-16: |
Expand Down Expand Up @@ -718,6 +719,28 @@

- [LZO File Format Documentation](http://www.lzop.org/){ target="_blank" }
- [LZO Wikipedia](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer){ target="_blank" }
## MSI

!!! warning "Partially supported"

=== "Description"

Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.

---

- **Handler type:** Archive
- **Vendor:** Microsoft

=== "References"

- [MSI File Format Documentation](https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer){ target="_blank" }
- [Compound File Binary Format](https://en.wikipedia.org/wiki/Compound_File_Binary_Format){ target="_blank" }

=== "Limitations"

- Limited to CFB based extraction, not full-on MSI extraction
- Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer
## multi-sevenzip

!!! success "Fully supported"
Expand Down
2 changes: 2 additions & 0 deletions python/unblob/handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
cab,
cpio,
dmg,
msi,
par2,
partclone,
rar,
Expand Down Expand Up @@ -89,6 +90,7 @@
arc.ARCHandler,
arj.ARJHandler,
cab.CABHandler,
msi.MsiHandler,
tar.TarUstarHandler,
tar.TarUnixHandler,
cpio.PortableASCIIHandler,
Expand Down
203 changes: 203 additions & 0 deletions python/unblob/handlers/archive/msi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import io
import struct
from typing import Optional

from structlog import get_logger

from unblob.extractors import Command

from ...file_utils import InvalidInputFormat
from ...models import (
File,
HandlerDoc,
HandlerType,
HexString,
Reference,
StructHandler,
ValidChunk,
)

# Table entry value treated as "unused": such entries are skipped both when
# collecting FAT sector ids and when looking for the last used sector.
FREE_SECTOR = 0xFFFFFFFF
# Sector id that terminates the walk over the DIFAT sector chain.
END_OF_CHAIN = 0xFFFFFFFE
# Smallest sector size (in bytes) accepted by MsiHandler.calculate_chunk.
HEADER_SIZE = 512

# Module-level structlog logger.
logger = get_logger()


class MsiHandler(StructHandler):
# Handler for Microsoft Installer (MSI) files. The chunk size is derived from
# the header declared in C_DEFINITIONS; extraction is delegated to EXTRACTOR.
NAME = "msi"

# Eight-byte magic the handler is matched on; these are the same bytes that the
# `signature` field comment in cfbf_header lists for the current version.
PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
# C declaration of the fixed-size header. NOTE(review): presumably consumed by
# StructHandler.parse_header together with HEADER_STRUCT - confirm in the base class.
C_DEFINITIONS = r"""
typedef struct cfbf_header
{
// [offset from start (bytes), length (bytes)]
uint8 signature[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
// 0x1a, 0xe1} for current version
uint8 clsid[16]; // [08H,16] reserved must be zero (WriteClassStg/
// GetClassFile uses root directory class id)
uint16 minorVersion; // [18H,02] minor version of the format: 33 is
// written by reference implementation
uint16 dllVersion; // [1AH,02] major version of the dll/format: 3 for
// 512-byte sectors, 4 for 4 KB sectors
uint16 byteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
uint16 sectorShift; // [1EH,02] size of sectors in power-of-two;
// typically 9 indicating 512-byte sectors
uint16 miniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
// typically 6 indicating 64-byte mini-sectors
uint16 reserved; // [22H,02] reserved, must be zero
uint32 reserved1; // [24H,04] reserved, must be zero
uint32 csectDir; // [28H,04] must be zero for 512-byte sectors,
// number of SECTs in directory chain for 4 KB
// sectors
uint32 csectFat; // [2CH,04] number of SECTs in the FAT chain
uint32 sectDirStart; // [30H,04] first SECT in the directory chain
uint32 txSignature; // [34H,04] signature used for transactions; must
// be zero. The reference implementation
// does not support transactions
uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
// typically 4096 bytes
uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
uint32 csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
uint32 sectDifStart; // [44H,04] first SECT in the DIFAT chain
uint32 csectDif; // [48H,04] number of SECTs in the DIFAT chain
uint32 sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
} cfbf_header_t;
"""
# Name of the typedef from C_DEFINITIONS that describes the header.
HEADER_STRUCT = "cfbf_header_t"

# Extraction runs the external `7z x` command on the carved chunk.
# NOTE(review): `-p` presumably supplies an empty password so 7z never prompts,
# and `-y` presumably auto-confirms every query - confirm against the 7z docs.
EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

# Human-readable metadata about the handler: description, vendor, references
# and the known limitations of the CFB-level extraction.
DOC = HandlerDoc(
name="MSI",
description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
handler_type=HandlerType.ARCHIVE,
vendor="Microsoft",
references=[
Reference(
title="MSI File Format Documentation",
url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
),
Reference(
title="Compound File Binary Format",
url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
),
],
limitations=[
"Limited to CFB based extraction, not full-on MSI extraction",
"Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer",
],
)

def _read_sector(
    self, file: File, start_offset: int, sector_size: int, sector_id: int
) -> bytes:
    """Return the raw bytes of sector ``sector_id``.

    Args:
        file: File object to read from; it is repositioned by this call.
        start_offset: Offset of the compound file header inside ``file``.
        sector_size: Size of one sector in bytes.
        sector_id: Zero-based sector index, counted after the header.

    Raises:
        InvalidInputFormat: If the sector starts past the end of ``file``
            or fewer than ``sector_size`` bytes could be read.
    """
    # The header occupies the first sector-sized slot, hence sector 0 begins
    # exactly one sector_size after start_offset.
    position = start_offset + sector_size + sector_id * sector_size
    if position > file.size():
        raise InvalidInputFormat("Invalid MSI file, sector offset too large")

    file.seek(position, io.SEEK_SET)
    data = file.read(sector_size)
    if len(data) == sector_size:
        return data

    raise InvalidInputFormat("Invalid MSI file, sector shorter than expected")

def _append_fat_sector(
    self, fat_sectors: list[int], sector_id: int, required_count: int
) -> bool:
    """Record ``sector_id`` as a FAT sector unless it is a free-slot marker.

    Args:
        fat_sectors: List collecting FAT sector ids; appended to in place.
        sector_id: Candidate entry taken from the header or a DIFAT sector.
        required_count: Number of FAT sectors the caller wants to collect.

    Returns:
        ``True`` once ``fat_sectors`` holds at least ``required_count``
        entries after this append; ``False`` otherwise, and always ``False``
        for a ``FREE_SECTOR`` entry (which is not appended).
    """
    is_allocated = sector_id != FREE_SECTOR
    if is_allocated:
        fat_sectors.append(sector_id)

    return is_allocated and len(fat_sectors) >= required_count

def _extend_fat_from_difat(
    self,
    file: File,
    header,
    start_offset: int,
    sector_size: int,
    entries_per_sector: int,
    fat_sectors: list[int],
) -> None:
    """Append FAT sector ids found by walking the DIFAT sector chain.

    The walk starts at ``header.sectDifStart`` and visits at most
    ``header.csectDif`` sectors. In every DIFAT sector the last 32-bit entry
    is the id of the next DIFAT sector and all preceding entries are FAT
    sector ids. The walk stops early when the chain ends or as soon as
    ``header.csectFat`` FAT sectors have been collected.

    Args:
        file: File object the sectors are read from.
        header: Parsed header; ``sectDifStart``, ``csectDif`` and
            ``csectFat`` are read from it.
        start_offset: Offset of the compound file header inside ``file``.
        sector_size: Size of one sector in bytes.
        entries_per_sector: Number of 32-bit entries in one sector.
        fat_sectors: List collecting FAT sector ids; extended in place.

    Raises:
        InvalidInputFormat: If a DIFAT sector cannot be read, or if the
            chain revisits a sector it already went through.
    """
    difat_sector = header.sectDifStart
    # csectDif is an untrusted 32-bit value, so it is not a sufficient bound
    # on its own: a chain that points back at itself would otherwise be
    # re-read up to csectDif times (or fill the list with duplicate ids).
    visited: set[int] = set()

    for _ in range(header.csectDif):
        if difat_sector in (FREE_SECTOR, END_OF_CHAIN):
            break

        if difat_sector in visited:
            raise InvalidInputFormat("Invalid MSI file, DIFAT chain loops")
        visited.add(difat_sector)

        raw_sector = self._read_sector(
            file, start_offset, sector_size, difat_sector
        )
        entries = struct.unpack(f"<{entries_per_sector}I", raw_sector)

        # The trailing entry links to the next DIFAT sector.
        difat_sector = entries[-1]
        for entry in entries[:-1]:
            if self._append_fat_sector(
                fat_sectors, entry, required_count=header.csectFat
            ):
                return

def _collect_fat_sectors(
    self,
    file: File,
    header,
    start_offset: int,
    sector_size: int,
    entries_per_sector: int,
) -> list[int]:
    """Return the ids of all ``header.csectFat`` FAT sectors.

    Ids are taken from the ``sectFat`` array embedded in the header first;
    if that does not yield enough of them, the DIFAT sector chain is
    consulted for the remainder.

    Raises:
        InvalidInputFormat: If the number of collected ids differs from
            ``header.csectFat``.
    """
    collected: list[int] = []

    # any() stops at the first entry that completes the required count.
    filled_from_header = any(
        self._append_fat_sector(collected, sect, header.csectFat)
        for sect in header.sectFat
    )
    if filled_from_header:
        return collected

    if len(collected) < header.csectFat:
        self._extend_fat_from_difat(
            file, header, start_offset, sector_size, entries_per_sector, collected
        )

    if len(collected) == header.csectFat:
        return collected

    raise InvalidInputFormat("Invalid MSI file, incomplete FAT chain")

def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
    """Compute the extent of the compound file starting at ``start_offset``.

    The size is the header (one sector-sized slot) plus every sector up to
    and including the highest-numbered one whose FAT entry is not
    ``FREE_SECTOR``.

    Args:
        file: File object containing the candidate chunk.
        start_offset: Offset at which the header begins.

    Returns:
        A ``ValidChunk`` from ``start_offset`` to the end of the last used
        sector. This implementation never returns ``None`` itself.

    Raises:
        InvalidInputFormat: If the sector size is below ``HEADER_SIZE``, the
            header declares no FAT sectors, or the FAT cannot be read.
    """
    file.seek(start_offset, io.SEEK_SET)
    header = self.parse_header(file)

    sector_size = 2**header.sectorShift
    if sector_size < HEADER_SIZE:
        raise InvalidInputFormat("Invalid MSI file, sector smaller than header")

    if header.csectFat == 0:
        raise InvalidInputFormat("Invalid MSI file, FAT chain is empty")

    # Every FAT entry is a 32-bit little-endian integer.
    entries_per_sector = sector_size // 4
    fat_format = f"<{entries_per_sector}I"

    fat_sectors = self._collect_fat_sectors(
        file, header, start_offset, sector_size, entries_per_sector
    )

    highest_used = 0
    for fat_index, fat_sector_id in enumerate(fat_sectors):
        fat_entries = struct.unpack(
            fat_format,
            self._read_sector(file, start_offset, sector_size, fat_sector_id),
        )

        # Scan backwards: only the last non-free entry of each FAT sector
        # can raise the overall maximum.
        last_used = next(
            (
                slot
                for slot in reversed(range(len(fat_entries)))
                if fat_entries[slot] != FREE_SECTOR
            ),
            None,
        )
        if last_used is None:
            continue

        highest_used = max(highest_used, fat_index * entries_per_sector + last_used)

    # One slot for the header plus sectors 0..highest_used inclusive.
    total_size = sector_size + ((highest_used + 1) * sector_size)

    return ValidChunk(
        start_offset=start_offset,
        end_offset=start_offset + total_size,
    )
3 changes: 2 additions & 1 deletion python/unblob/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
DEFAULT_SKIP_MAGIC = (
"BFLT",
"Composite Document File V2 Document",
# Skip entry disabled so that MSI files are still processed:
# "Composite Document File V2 Document",
"Erlang BEAM file",
"GIF",
"GNU message catalog",
Expand Down
63 changes: 63 additions & 0 deletions tests/handlers/archive/test_msi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import struct

import pytest

from unblob.file_utils import File
from unblob.handlers.archive.msi import (
END_OF_CHAIN,
FREE_SECTOR,
MsiHandler,
)


def _build_msi_with_sector_shift(sector_shift: int) -> bytes:
    """Build a minimal compound file: one header sector plus one FAT sector.

    The header declares a single FAT sector (sector 0), and that FAT sector
    marks only its first entry as used, so the result is exactly two sectors
    of ``1 << sector_shift`` bytes each.
    """
    sector_size = 1 << sector_shift
    fat_entry_count = sector_size // 4

    # Header fields live in the first 512 bytes; the rest of the sector
    # stays zero-filled.
    header = bytearray(sector_size)
    header[0:8] = bytes.fromhex("D0 CF 11 E0 A1 B1 1A E1")

    # Offsets and values taken from the CFBF header specification:
    # minorVersion, dllVersion, byteOrder, sectorShift, miniSectorShift, reserved
    struct.pack_into(
        "<6H",
        header,
        0x18,
        0x0033,
        3 if sector_shift < 12 else 4,
        0xFFFE,
        sector_shift,
        6,
        0,
    )

    single_fields = (
        (0x2C, 1),  # csectFat
        (0x38, 4096),  # miniSectorCutoff
        (0x3C, FREE_SECTOR),  # sectMiniFatStart
        (0x44, FREE_SECTOR),  # sectDifStart
    )
    for offset, value in single_fields:
        struct.pack_into("<I", header, offset, value)

    # sectFat[109]: only the first slot points at a FAT sector (sector 0).
    struct.pack_into("<109I", header, 0x4C, 0, *([FREE_SECTOR] * 108))

    # The lone FAT sector: entry 0 is in use, every other entry is free.
    fat_sector = struct.pack(
        f"<{fat_entry_count}I",
        END_OF_CHAIN,
        *([FREE_SECTOR] * (fat_entry_count - 1)),
    )

    return bytes(header) + fat_sector


@pytest.mark.parametrize("sector_shift", [9, 12])
def test_calculate_chunk_respects_sector_size(sector_shift: int):
    """The chunk must cover exactly the header sector plus the FAT sector.

    The synthetic file is placed after a few unrelated bytes so that a
    non-zero start offset is exercised as well.
    """
    leading_bytes = b"prefix"
    payload = _build_msi_with_sector_shift(sector_shift)
    expected_start = len(leading_bytes)
    expected_end = expected_start + len(payload)

    source = File.from_bytes(leading_bytes + payload)
    chunk = MsiHandler().calculate_chunk(source, expected_start)

    assert chunk is not None
    assert chunk.start_offset == expected_start
    assert chunk.end_offset == expected_end
assert chunk.end_offset == len(prefix) + len(msi_content)
3 changes: 3 additions & 0 deletions tests/integration/archive/msi/__input__/7z2501.msi
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/integration/archive/msi/__input__/7z2501.msi.padded
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading
Loading