diff --git a/.agents/issue-227-ledger.yml b/.agents/issue-227-ledger.yml new file mode 100644 index 00000000..b5141404 --- /dev/null +++ b/.agents/issue-227-ledger.yml @@ -0,0 +1,367 @@ +version: 1 +issue: 227 +base: main +branch: codex/issue-227 +tasks: + - id: task-01 + title: Add `mapping_diff_report` console script entrypoint to `pyproject.toml` + under `[project.scripts]` pointing to `.cli.mapping_diff_report:main` + status: done + started_at: '2026-02-23T18:21:21Z' + finished_at: '2026-02-23T18:21:38Z' + commit: 1257ec8041e7df00837f982b536348187dc3cea7 + notes: [] + - id: task-02 + title: Create `src//cli/mapping_diff_report.py` with argument parser + that supports `--help` flag + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-03 + title: Implement error handling that exits non-zero and writes single-line stderr + message including `config/name_registry.yml` path when registry is missing or + unreadable + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-04 + title: Wire the CLI main function to call the report generator and write output + to stdout + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-05 + title: Add exit code logic to return zero on success and non-zero on fatal errors + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-06 + title: Create `src//reports/mapping_diff.py` with a callable report generator + function signature that accepts registry path and input sources + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-07 + title: Implement registry loading logic within the report generator using existing + config patterns + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-08 + title: Implement input scanning logic that identifies unmapped names and fallback-mapped + names from normalization and 
reconciliation sources + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-09 + title: Implement deterministic `UNMAPPED` section generation that lists raw input + names not present in registry + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-10 + title: Implement deterministic `FALLBACK_MAPPED` section generation that lists + input names resolved via fallback with their canonical names + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-11 + title: Implement deterministic `SUGGESTIONS` section generation that provides + canonical name suggestions for every unmapped entry using title-case transformation + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-12 + title: Identify the specific normalization and reconciliation functions that perform + counterparty name resolution + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-13 + title: Modify the name resolution logic to consult the registry for direct canonical + and alias lookups before applying hardcoded mappings + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-14 + title: Extend the return type or object from name resolution functions to include + a `source` field indicating `registry` or `fallback` origin + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-15 + title: Update all call sites of the modified resolution functions to handle the + new `source` field in the return value + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-16 + title: Add unit test in `tests/test_mapping_diff_report_cli.py` that verifies + `mapping_diff_report --help` exits with status zero and prints usage text containing + `mapping_diff_report` string + status: todo + started_at: null + finished_at: null + commit: 
'' + notes: [] + - id: task-17 + title: Add unit test that verifies missing `config/name_registry.yml` causes non-zero + exit and single-line stderr message containing the registry path + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-18 + title: Add unit test that verifies unreadable `config/name_registry.yml` causes + non-zero exit and appropriate stderr message + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-19 + title: Add unit test that verifies deterministic output against fixed fixtures + contains all three required sections with expected content + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-20 + title: Add integration test in `tests/test_normalization_registry_first.py` using + `name_registry_before.yml` that verifies at least one fixture input resolves + via fallback and appears in `FALLBACK_MAPPED` section + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-21 + title: Add integration test using `name_registry_after.yml` with same inputs that + verifies previously fallback-mapped name now resolves via registry + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-22 + title: Add integration test that captures logs when using `name_registry_after.yml` + and asserts no warning messages contain the previously fallback-mapped raw name + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-23 + title: Add integration test that verifies `mapping_diff_report` output changes + between before and after registry states for the same input set + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-24 + title: Create `tests/fixtures/name_registry_before.yml` with at least one missing + alias that will trigger fallback resolution + status: todo + started_at: null + finished_at: null + 
commit: '' + notes: [] + - id: task-25 + title: Create `tests/fixtures/name_registry_after.yml` with the previously missing + alias added + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-26 + title: Add normalization/reconciliation input fixtures required for integration + tests + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-27 + title: Update fixture loading in tests to use explicit fixture selection via temp + working dir or dependency injection (not real `config/name_registry.yml`) + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-28 + title: 'Update documentation (README.md or docs page) to include ordered workflow: + (1) edit `config/name_registry.yml`, (2) run `mapping_diff_report`, (3) interpret + `UNMAPPED`, `FALLBACK_MAPPED`, `SUGGESTIONS` sections' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-29 + title: Document which files in the current PR branch diff are registry/report/normalization/tests/docs-related + and which are unrelated, then create a separate branch containing only the in-scope + changes + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-30 + title: '`pyproject.toml` defines a `[project.scripts]` console entrypoint named + `mapping_diff_report`' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-31 + title: Running `mapping_diff_report --help` exits with status code `0` and prints + usage text that includes the string `mapping_diff_report` + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-32 + title: If `config/name_registry.yml` is missing or unreadable, `mapping_diff_report` + exits non-zero and writes a single-line error message to stderr that includes + `config/name_registry.yml` + status: todo + started_at: null + finished_at: null + commit: 
'' + notes: [] + - id: task-33 + title: '`src//reports/mapping_diff.py` exists and can be imported without + performing IO at import time' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-34 + title: 'With fixed fixture registry + fixed fixture normalization/reconciliation + inputs, `mapping_diff_report` output is deterministic and contains three labeled + sections: `UNMAPPED`, `FALLBACK_MAPPED`, and `SUGGESTIONS`' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-35 + title: The `UNMAPPED` section lists each input name not present in the registry + fixture one per line and prints the raw input name exactly as encountered + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-36 + title: The `FALLBACK_MAPPED` section lists each input name resolved by fallback + logic (not registry alias) and includes both the raw input name and resolved + canonical name on each line + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-37 + title: The `SUGGESTIONS` section includes a non-empty suggested canonical name + for every entry in `UNMAPPED`, and each suggestion line follows the format ` + -> ` where `suggested_canonical_name` is generated + using title-case transformation + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-38 + title: Normalization/reconciliation code consults the name registry before any + hardcoded/fallback mappings and records mapping source as `registry` or `fallback` + per mapped name + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-39 + title: 'Integration scenario A: using `tests/fixtures/name_registry_before.yml`, + at least one fixture input resolves via fallback and `mapping_diff_report` lists + it under `FALLBACK_MAPPED`' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: 
task-40 + title: 'Integration scenario B: using `tests/fixtures/name_registry_after.yml` + (same inputs), the previously fallback-mapped name resolves via registry and + does not appear in `FALLBACK_MAPPED` or `UNMAPPED` in `mapping_diff_report` + output' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-41 + title: When using `tests/fixtures/name_registry_after.yml`, the normalization/reconciliation + run emits no warning log messages containing the previously fallback-mapped + raw name + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-42 + title: 'A documentation file (README.md or a docs page) contains an explicit ordered + workflow with the literal steps: (1) edit `config/name_registry.yml`, (2) run + `mapping_diff_report`, (3) interpret `UNMAPPED`, `FALLBACK_MAPPED`, and `SUGGESTIONS` + sections' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] + - id: task-43 + title: 'The PR branch diff contains only files matching these patterns: `src//cli/*`, + `src//reports/*`, `src//name_registry.py`, `tests/test_*registry*.py`, + `tests/test_*mapping_diff*.py`, `tests/fixtures/name_registry*.yml`, `config/name_registry.yml`, + `pyproject.toml` (scripts section only), `README.md` or `docs/*.md`' + status: todo + started_at: null + finished_at: null + commit: '' + notes: [] diff --git a/README.md b/README.md index 163a2bce..e60a7b63 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,12 @@ A programmatic replacement for the MOSERS spreadsheet workflow used to evaluate - **Keepalive troubleshooting:** [docs/KEEPALIVE_TROUBLESHOOTING.md](docs/KEEPALIVE_TROUBLESHOOTING.md) - **Consumer setup checklist:** [docs/SETUP_CHECKLIST.md](docs/SETUP_CHECKLIST.md) +## Name Registry Workflow + +1. Edit `config/name_registry.yml`. +2. Run `mapping_diff_report`. +3. Interpret `UNMAPPED`, `FALLBACK_MAPPED`, and `SUGGESTIONS` sections. 
+ ## Repository automation (high level) This repo is integrated with the central [stranske/Workflows](https://github.com/stranske/Workflows) library. diff --git a/pyproject.toml b/pyproject.toml index 42f7e260..986b9a64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,9 @@ dev = [ Homepage = "https://github.com/stranske/Template" Repository = "https://github.com/stranske/Template" +[project.scripts] +mapping_diff_report = "counter_risk.cli.mapping_diff_report:main" + [tool.setuptools.packages.find] where = ["src"] diff --git a/release.spec b/release.spec index d9e12d3b..c654a3cf 100644 --- a/release.spec +++ b/release.spec @@ -23,7 +23,7 @@ if fixture_template.exists(): a = Analysis( - [str(project_root / "src" / "counter_risk" / "cli.py")], + [str(project_root / "src" / "counter_risk" / "cli" / "__main__.py")], pathex=[str(project_root / "src")], binaries=[], datas=datas, diff --git a/src/counter_risk/cli.py b/src/counter_risk/cli/__init__.py similarity index 97% rename from src/counter_risk/cli.py rename to src/counter_risk/cli/__init__.py index f4026338..1b3d3ae0 100644 --- a/src/counter_risk/cli.py +++ b/src/counter_risk/cli/__init__.py @@ -61,7 +61,3 @@ def main(argv: list[str] | None = None) -> int: return 0 command_handler = cast(Callable[[argparse.Namespace], int], handler) return command_handler(args) - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/src/counter_risk/cli/__main__.py b/src/counter_risk/cli/__main__.py new file mode 100644 index 00000000..70e82f1f --- /dev/null +++ b/src/counter_risk/cli/__main__.py @@ -0,0 +1,8 @@ +"""`python -m counter_risk.cli` entrypoint.""" + +from __future__ import annotations + +from counter_risk.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/counter_risk/cli/mapping_diff_report.py b/src/counter_risk/cli/mapping_diff_report.py new file mode 100644 index 00000000..51535620 --- /dev/null +++ b/src/counter_risk/cli/mapping_diff_report.py @@ 
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``mapping_diff_report`` command.

    Returns:
        A parser exposing ``--registry`` (defaulting to
        ``config/name_registry.yml``) and the repeatable
        ``--normalization-name`` / ``--reconciliation-name`` options.
    """

    parser = argparse.ArgumentParser(
        prog="mapping_diff_report",
        description="Generate a deterministic mapping diff report.",
    )
    parser.add_argument(
        "--registry",
        type=Path,
        default=Path("config/name_registry.yml"),
        help="Path to registry YAML file.",
    )
    parser.add_argument(
        "--normalization-name",
        action="append",
        default=[],
        help="Raw input name observed during normalization. Can be provided multiple times.",
    )
    parser.add_argument(
        "--reconciliation-name",
        action="append",
        default=[],
        help="Raw input name observed during reconciliation. Can be provided multiple times.",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the mapping diff report CLI.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` once the report has been written to stdout, ``1`` when report
        generation failed (a single-line message is written to stderr).
    """

    args = build_parser().parse_args(argv)

    input_sources = {
        "normalization": list(args.normalization_name),
        "reconciliation": list(args.reconciliation_name),
    }
    try:
        report = generate_mapping_diff_report(args.registry, input_sources)
    except (OSError, ValueError) as exc:
        # OSError covers a missing/unreadable registry that reaches us without
        # being wrapped in ValueError; both must end as one stderr line, never
        # as a traceback.
        error_line = " ".join(str(exc).splitlines())
        registry_text = str(args.registry)
        # The operator needs to know which registry file failed, so make sure
        # the path is part of the message even if the exception omitted it.
        if registry_text not in error_line:
            error_line = f"{registry_text}: {error_line}" if error_line else registry_text
        print(error_line, file=sys.stderr)
        return 1

    sys.stdout.write(report)
    return 0
@dataclass(frozen=True)
class NameResolution:
    """A resolved counterparty name and mapping origin."""

    # Name exactly as it was passed to the resolver.
    raw_name: str
    # Registry display name, fallback label, or the canonicalized input.
    canonical_name: str
    # Which lookup produced ``canonical_name``.
    source: Literal["registry", "fallback", "unmapped"]


# Hard-coded labels consulted only after the registry has no match.
_COUNTERPARTY_FALLBACK_MAPPINGS = {
    "Citigroup": "Citibank",
    "Bank of America, NA": "Bank of America",
    "Bank of America NA": "Bank of America",
    "Goldman Sachs Int'l": "Goldman Sachs",
    "Societe Generale": "Soc Gen",
    "Barclays Bank PLC": "Barclays",
}

_CLEARING_HOUSE_FALLBACK_MAPPINGS = {
    "CME Clearing House": "CME",
    "ICE Clear U.S.": "ICE",
    "ICE Clear US": "ICE",
    "ICE Clear Europe": "ICE Euro",
    "EUREX Clearing": "EUREX",
    "Japan Securities Clearing Corporation": "Japan SCC",
    "Korea Exchange (in-house)": "Korea Exchange",
}


@lru_cache(maxsize=8)
def _load_alias_lookup(registry_path: str) -> dict[str, str]:
    """Return the casefolded alias -> display-name lookup for *registry_path*.

    A registry that fails to load yields an empty lookup so resolution degrades
    to the hard-coded fallback mappings instead of raising. The result,
    including that empty lookup, is memoized per path by ``lru_cache``, so
    edits to the file are not seen until the cache is cleared.
    """

    try:
        registry = load_name_registry(Path(registry_path))
    except ValueError as exc:
        # NOTE(review): assumes load_name_registry reports missing, unreadable
        # and invalid registries as ValueError - confirm against its contract.
        # Deliberately best-effort, but leave a trace so a misconfigured
        # registry can be diagnosed; debug level keeps normal runs quiet.
        import logging

        logging.getLogger(__name__).debug(
            "Name registry %s could not be loaded; using fallback mappings only: %s",
            registry_path,
            exc,
        )
        return {}
    return _build_alias_lookup(registry)


def _build_alias_lookup(registry: NameRegistryConfig) -> dict[str, str]:
    """Map every canonical key, display name and alias to its display name.

    Keys are ``canonicalize_name(...).casefold()`` so lookups are
    case-insensitive. When two entries share a key, the later entry wins.
    """

    lookup: dict[str, str] = {}
    for entry in registry.entries:
        lookup[canonicalize_name(entry.canonical_key).casefold()] = entry.display_name
        lookup[canonicalize_name(entry.display_name).casefold()] = entry.display_name
        for alias in entry.aliases:
            lookup[canonicalize_name(alias).casefold()] = entry.display_name
    return lookup


def resolve_counterparty(
    name: str,
    *,
    registry_path: str | Path = Path("config/name_registry.yml"),
) -> NameResolution:
    """Resolve counterparty name with registry-first semantics.

    Args:
        name: Raw counterparty label.
        registry_path: Registry YAML to consult; resolved to an absolute path
            before it is used as the cache key.

    Returns:
        A ``NameResolution`` whose ``source`` is ``"registry"`` for a
        case-insensitive registry hit, ``"fallback"`` for an exact match in
        the hard-coded table, and ``"unmapped"`` otherwise (in which case
        ``canonical_name`` is the canonicalized input).
    """

    normalized = canonicalize_name(name)
    alias_lookup = _load_alias_lookup(str(Path(registry_path).resolve()))
    registry_match = alias_lookup.get(normalized.casefold())
    if registry_match is not None:
        return NameResolution(raw_name=name, canonical_name=registry_match, source="registry")

    # The fallback table is matched case-sensitively, unlike the registry.
    fallback_match = _COUNTERPARTY_FALLBACK_MAPPINGS.get(normalized)
    if fallback_match is not None:
        return NameResolution(raw_name=name, canonical_name=fallback_match, source="fallback")

    return NameResolution(raw_name=name, canonical_name=normalized, source="unmapped")


def normalize_counterparty(name: str) -> str:
    """Normalize a counterparty name to the canonical historical workbook label."""

    return resolve_counterparty(name).canonical_name


def normalize_clearing_house(name: str) -> str:
    """Normalize a clearing house name to the canonical historical workbook label."""

    # NOTE(review): this uses whitespace-only normalization while counterparties
    # go through canonicalize_name - confirm the difference is intentional.
    normalized = _normalize_whitespace(name)
    return _CLEARING_HOUSE_FALLBACK_MAPPINGS.get(normalized, normalized)
def _counterparty_resolution_maps_from_records(
    totals_records: list[dict[str, Any]],
) -> tuple[dict[str, set[str]], dict[str, str]]:
    """Resolve every counterparty label found in *totals_records*.

    Args:
        totals_records: Records whose ``"counterparty"`` value is the raw label.

    Returns:
        A pair ``(normalized_to_raw, sources_by_raw_name)``: the first maps each
        resolved canonical name to the set of raw labels that produced it, the
        second maps each raw label to its resolution source.
    """

    normalized_to_raw: dict[str, set[str]] = {}
    sources_by_raw_name: dict[str, str] = {}
    for record in totals_records:
        raw_value = record.get("counterparty")
        # An explicit None must count as "no counterparty"; str(None) would
        # otherwise invent a counterparty literally called "None".
        if raw_value is None:
            continue
        raw_name = str(raw_value).strip()
        if not raw_name:
            continue
        resolution = resolve_counterparty(raw_name)
        normalized_to_raw.setdefault(resolution.canonical_name, set()).add(raw_name)
        sources_by_raw_name[raw_name] = resolution.source
    return normalized_to_raw, sources_by_raw_name


def _normalized_counterparties_from_records(
    totals_records: list[dict[str, Any]],
) -> dict[str, set[str]]:
    """Return only the canonical-name -> raw-labels map for *totals_records*."""

    normalized_to_raw, _ = _counterparty_resolution_maps_from_records(totals_records)
    return normalized_to_raw
# Mapping keys (compared casefolded) whose values hold raw counterparty names.
_NORMALIZATION_NAME_KEYS = {
    "counterparty",
    "counterparty_name",
    "name",
    "raw_counterparty",
    "raw_name",
}
_RECONCILIATION_NAME_KEYS = {
    "counterparties_in_data",
    "raw_counterparty_labels",
}


def _title_case_suggestion(raw_name: str) -> str:
    """Return *raw_name* with every word title-cased, whitespace untouched.

    ``str.title()`` treats an apostrophe as a word boundary and produces
    ``Int'L`` or ``Moody'S``. Here a one- or two-letter suffix after an
    apostrophe stays part of the word (``Int'l``, ``Moody's``), while a longer
    tail is still capitalized as its own word (``O'Neil``).
    """

    import re

    return re.sub(
        r"[^\W\d_]+(?:['\u2019][^\W\d_]{1,2}(?![^\W\d_]))?",
        lambda match: match.group(0).capitalize(),
        raw_name,
    )


def _sorted_raw_names(values: Iterable[str]) -> list[str]:
    """Sort names deterministically with case-insensitive primary ordering."""

    # The raw string is the tie-breaker so names differing only by case still
    # have one fixed order.
    return sorted(values, key=lambda raw_name: (raw_name.casefold(), raw_name))


def _is_nonblank(value: str) -> bool:
    """Return ``True`` when *value* contains a non-whitespace character."""

    return bool(value.strip())


def _iter_string_values(value: Any) -> Iterator[str]:
    """Yield non-blank strings from *value*, descending into non-mapping iterables.

    Mappings are skipped entirely; non-iterable values yield nothing.
    """

    if isinstance(value, str):
        if _is_nonblank(value):
            yield value
        return
    if isinstance(value, Mapping):
        return
    if isinstance(value, Iterable):
        for item in value:
            yield from _iter_string_values(item)


def _iter_names_from_payload(
    value: Any,
    *,
    name_keys: set[str],
    collect_strings: bool = False,
) -> Iterator[str]:
    """Yield non-blank strings stored under any of *name_keys* in a nested payload.

    Args:
        value: Arbitrarily nested mappings, iterables and scalars.
        name_keys: Casefolded mapping keys that mark their values as names.
        collect_strings: ``True`` once an enclosing key matched; strings found
            from that point down are yielded.
    """

    if isinstance(value, str):
        if collect_strings and _is_nonblank(value):
            yield value
        return

    if isinstance(value, Mapping):
        for raw_key, raw_child in value.items():
            key = str(raw_key).strip().casefold()
            # Collection is sticky: everything beneath a matching key counts.
            child_collect = collect_strings or key in name_keys
            yield from _iter_names_from_payload(
                raw_child,
                name_keys=name_keys,
                collect_strings=child_collect,
            )
        return

    if isinstance(value, Iterable):
        for child in value:
            yield from _iter_names_from_payload(
                child,
                name_keys=name_keys,
                collect_strings=collect_strings,
            )


def _iter_flat_string_sequence(payload: Any) -> Iterator[str]:
    """Yield the non-blank items of *payload* only if it is a pure string sequence.

    A bare string, a mapping, a non-iterable, an empty iterable, or an iterable
    containing any non-string item yields nothing.
    """

    if isinstance(payload, (str, Mapping)):
        return
    if not isinstance(payload, Iterable):
        return

    # Materialize once so one-shot iterators can be checked and then replayed.
    values = list(payload)
    if not values or not all(isinstance(value, str) for value in values):
        return

    for value in values:
        if _is_nonblank(value):
            yield value
def _iter_input_names(input_sources: Mapping[str, Any]) -> Iterable[str]:
    """Yield raw names from every input source, visiting sources in sorted key order.

    ``normalization`` and ``reconciliation`` payloads are scanned both as a
    flat list of names and as nested records keyed by the known name fields;
    any other source is treated as a plain (possibly nested) list of strings.
    """

    keys_by_source = {
        "normalization": _NORMALIZATION_NAME_KEYS,
        "reconciliation": _RECONCILIATION_NAME_KEYS,
    }
    for source_name in sorted(input_sources):
        payload = input_sources[source_name]
        name_keys = keys_by_source.get(str(source_name).strip().casefold())
        if name_keys is None:
            # Backward-compatible fallback for legacy callers that pass a flat list of names.
            yield from _iter_string_values(payload)
        else:
            yield from _iter_flat_string_sequence(payload)
            yield from _iter_names_from_payload(payload, name_keys=name_keys)


def generate_mapping_diff_report(
    registry_path: str | Path,
    input_sources: Mapping[str, Any],
) -> str:
    """Build the three-section mapping diff report as newline-terminated text.

    Args:
        registry_path: Registry YAML used for resolution.
        input_sources: Payloads keyed by source name.

    Returns:
        ``UNMAPPED``, ``FALLBACK_MAPPED`` and ``SUGGESTIONS`` sections, each
        sorted case-insensitively and separated by a blank line.
    """

    # Load once so missing/unreadable/invalid registry is treated as fatal for report generation.
    load_name_registry(registry_path)

    unmapped: set[str] = set()
    fallback_by_raw: dict[str, str] = {}
    for candidate in _iter_input_names(input_sources):
        resolution = resolve_counterparty(candidate, registry_path=registry_path)
        if resolution.source == "fallback":
            # Keep the first resolution seen for a repeated raw name.
            if candidate not in fallback_by_raw:
                fallback_by_raw[candidate] = resolution.canonical_name
        elif resolution.source == "unmapped":
            unmapped.add(candidate)

    ordered_unmapped = _sorted_raw_names(unmapped)
    report_lines: list[str] = [
        "UNMAPPED",
        *ordered_unmapped,
        "",
        "FALLBACK_MAPPED",
        *(f"{raw} -> {fallback_by_raw[raw]}" for raw in _sorted_raw_names(fallback_by_raw)),
        "",
        "SUGGESTIONS",
        *(f"{raw} -> {_title_case_suggestion(raw)}" for raw in ordered_unmapped),
    ]
    return "\n".join(report_lines) + "\n"
/dev/null +++ b/tests/fixtures/name_registry_after.yml @@ -0,0 +1,7 @@ +schema_version: 1 +entries: + - canonical_key: soc_gen_inc + display_name: Soc Gen Inc + aliases: + - Soc Gen Inc + - Societe Generale diff --git a/tests/fixtures/name_registry_before.yml b/tests/fixtures/name_registry_before.yml new file mode 100644 index 00000000..fee6e526 --- /dev/null +++ b/tests/fixtures/name_registry_before.yml @@ -0,0 +1,6 @@ +schema_version: 1 +entries: + - canonical_key: soc_gen_inc + display_name: Soc Gen Inc + aliases: + - Soc Gen Inc diff --git a/tests/pipeline/test_reconcile_series_coverage.py b/tests/pipeline/test_reconcile_series_coverage.py index 613eee90..47fa42a8 100644 --- a/tests/pipeline/test_reconcile_series_coverage.py +++ b/tests/pipeline/test_reconcile_series_coverage.py @@ -3,6 +3,7 @@ from __future__ import annotations from inspect import Parameter, signature +from pathlib import Path import pytest @@ -288,6 +289,70 @@ def test_reconcile_series_coverage_does_not_warn_when_raw_labels_normalize_to_he assert not any("unmapped counterparty" in warning for warning in result["warnings"]) +def test_reconcile_series_coverage_includes_fallback_source_in_unmapped_warning( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_dir = tmp_path / "config" + config_dir.mkdir(parents=True) + (config_dir / "name_registry.yml").write_text( + "\n".join( + [ + "schema_version: 1", + "entries:", + " - canonical_key: soc_gen_inc", + " display_name: Soc Gen Inc", + " aliases:", + " - Soc Gen Inc", + ] + ) + + "\n", + encoding="utf-8", + ) + monkeypatch.chdir(tmp_path) + + result = reconcile_series_coverage( + parsed_data_by_sheet={ + "Total": {"totals": [{"counterparty": "Societe Generale"}], "futures": []} + }, + historical_series_headers_by_sheet={"Total": ("Legacy Counterparty",)}, + ) + + assert any("source=fallback" in warning for warning in result["warnings"]) + + +def test_reconcile_series_coverage_includes_registry_source_in_unmapped_warning( + 
tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_dir = tmp_path / "config" + config_dir.mkdir(parents=True) + (config_dir / "name_registry.yml").write_text( + "\n".join( + [ + "schema_version: 1", + "entries:", + " - canonical_key: soc_gen_inc", + " display_name: Soc Gen Inc", + " aliases:", + " - Societe Generale", + ] + ) + + "\n", + encoding="utf-8", + ) + monkeypatch.chdir(tmp_path) + + result = reconcile_series_coverage( + parsed_data_by_sheet={ + "Total": {"totals": [{"counterparty": "Societe Generale"}], "futures": []} + }, + historical_series_headers_by_sheet={"Total": ("Legacy Counterparty",)}, + ) + + assert any("source=registry" in warning for warning in result["warnings"]) + + def test_normalized_counterparties_from_records_uses_normalization_mapping() -> None: totals_records = [ {"counterparty": "Bank of America, NA"}, diff --git a/tests/pipeline/test_run_pipeline.py b/tests/pipeline/test_run_pipeline.py index 069c3078..76293336 100644 --- a/tests/pipeline/test_run_pipeline.py +++ b/tests/pipeline/test_run_pipeline.py @@ -1485,10 +1485,10 @@ def __init__(self) -> None: self._slides = [_FakeSlide()] self.Count = 1 - def __iter__(self): + def __iter__(self) -> Any: return iter(self._slides) - def __getitem__(self, idx: int): # type: ignore[no-untyped-def] + def __getitem__(self, idx: int) -> Any: return self._slides[idx - 1] class _Presentation: diff --git a/tests/test_mapping_diff_report.py b/tests/test_mapping_diff_report.py new file mode 100644 index 00000000..2a906765 --- /dev/null +++ b/tests/test_mapping_diff_report.py @@ -0,0 +1,199 @@ +"""Unit tests for mapping diff report input scanning.""" + +from __future__ import annotations + +from pathlib import Path + +from counter_risk.reports.mapping_diff import generate_mapping_diff_report + + +def _write_registry(path: Path) -> None: + path.write_text( + "\n".join( + [ + "schema_version: 1", + "entries:", + " - canonical_key: bank_of_america", + " display_name: Bank of America", 
+ " aliases:", + " - Bank of America", + ] + ) + + "\n", + encoding="utf-8", + ) + + +def test_generate_mapping_diff_report_scans_normalization_and_reconciliation_payloads( + tmp_path: Path, +) -> None: + registry_path = tmp_path / "name_registry.yml" + _write_registry(registry_path) + + report = generate_mapping_diff_report( + registry_path, + { + "normalization": [ + {"counterparty": "Societe Generale", "notional": 1.0}, + {"counterparty": "Unknown House"}, + ], + "reconciliation": { + "by_sheet": { + "Total": { + "counterparties_in_data": [ + "Bank of America, NA", + "Unknown House", + "Citigroup", + ], + "normalized_counterparties_in_data": [ + "Bank of America", + "Unknown House", + "Citibank", + ], + } + } + }, + }, + ) + + assert "UNMAPPED\nUnknown House\n" in report + assert "FALLBACK_MAPPED\n" in report + assert "Bank of America, NA -> Bank of America\n" in report + assert "Citigroup -> Citibank\n" in report + assert "Societe Generale -> Soc Gen\n" in report + assert "SUGGESTIONS\nUnknown House -> Unknown House\n" in report + + +def test_generate_mapping_diff_report_ignores_non_name_string_fields(tmp_path: Path) -> None: + registry_path = tmp_path / "name_registry.yml" + _write_registry(registry_path) + + report = generate_mapping_diff_report( + registry_path, + { + "normalization": { + "metadata": {"run_id": "run-123"}, + "rows": [{"counterparty": "Societe Generale", "segment": "swaps"}], + }, + "reconciliation": {"warnings": ["raw='Societe Generale'"]}, + }, + ) + + assert "run-123" not in report + assert "raw='Societe Generale'" not in report + assert "Societe Generale -> Soc Gen\n" in report + + +def test_generate_mapping_diff_report_preserves_raw_names(tmp_path: Path) -> None: + registry_path = tmp_path / "name_registry.yml" + _write_registry(registry_path) + + report = generate_mapping_diff_report( + registry_path, + { + "normalization": [ + {"counterparty": " Unknown House "}, + {"counterparty": " "}, + ], + }, + ) + + assert "UNMAPPED\n Unknown 
House \n" in report + assert "Unknown House\n" not in report + + +def test_generate_mapping_diff_report_fallback_section_is_deterministic(tmp_path: Path) -> None: + registry_path = tmp_path / "name_registry.yml" + _write_registry(registry_path) + + report = generate_mapping_diff_report( + registry_path, + { + "normalization": [ + {"counterparty": "Citigroup"}, + {"counterparty": "Bank of America, NA"}, + {"counterparty": "Societe Generale"}, + ], + "reconciliation": { + "counterparties_in_data": [ + "Societe Generale", + "Citigroup", + "Bank of America, NA", + ] + }, + }, + ) + + expected_section = "\n".join( + [ + "FALLBACK_MAPPED", + "Bank of America, NA -> Bank of America", + "Citigroup -> Citibank", + "Societe Generale -> Soc Gen", + "", + ] + ) + assert expected_section in report + + +def test_generate_mapping_diff_report_suggestions_are_deterministic_title_case( + tmp_path: Path, +) -> None: + registry_path = tmp_path / "name_registry.yml" + _write_registry(registry_path) + + report = generate_mapping_diff_report( + registry_path, + { + "normalization": [ + {"counterparty": "aaa holdings"}, + {"counterparty": "zeta llc"}, + {"counterparty": "aaa holdings"}, + ], + "reconciliation": { + "counterparties_in_data": [ + "zeta llc", + "aaa holdings", + ] + }, + }, + ) + + expected_section = "\n".join( + [ + "SUGGESTIONS", + "aaa holdings -> Aaa Holdings", + "zeta llc -> Zeta Llc", + "", + ] + ) + assert expected_section in report + + +def test_generate_mapping_diff_report_sections_use_required_line_formats(tmp_path: Path) -> None: + registry_path = tmp_path / "name_registry.yml" + _write_registry(registry_path) + + report = generate_mapping_diff_report( + registry_path, + { + "normalization": [ + {"counterparty": "UNKNOWN broker"}, + {"counterparty": "Citigroup"}, + ], + "reconciliation": {"counterparties_in_data": ["UNKNOWN broker"]}, + }, + ) + + lines = report.splitlines() + unmapped_start = lines.index("UNMAPPED") + fallback_start = 
lines.index("FALLBACK_MAPPED") + suggestions_start = lines.index("SUGGESTIONS") + + unmapped_lines = lines[unmapped_start + 1 : fallback_start - 1] + fallback_lines = lines[fallback_start + 1 : suggestions_start - 1] + suggestion_lines = lines[suggestions_start + 1 :] + + assert unmapped_lines == ["UNKNOWN broker"] + assert fallback_lines == ["Citigroup -> Citibank"] + assert suggestion_lines == ["UNKNOWN broker -> Unknown Broker"] diff --git a/tests/test_mapping_diff_report_cli.py b/tests/test_mapping_diff_report_cli.py new file mode 100644 index 00000000..71d8c239 --- /dev/null +++ b/tests/test_mapping_diff_report_cli.py @@ -0,0 +1,121 @@ +"""Tests for mapping_diff_report CLI behavior.""" + +from __future__ import annotations + +import os +import stat +import subprocess +import sys +from pathlib import Path + + +def _cli_cmd() -> list[str]: + return [sys.executable, "-m", "counter_risk.cli.mapping_diff_report"] + + +def _cli_env() -> dict[str, str]: + env = os.environ.copy() + src_path = str(Path("src").resolve()) + env["PYTHONPATH"] = ( + src_path if "PYTHONPATH" not in env else f"{src_path}{os.pathsep}{env['PYTHONPATH']}" + ) + return env + + +def test_mapping_diff_report_help_exits_zero() -> None: + result = subprocess.run( + [*_cli_cmd(), "--help"], + check=False, + capture_output=True, + text=True, + env=_cli_env(), + ) + assert result.returncode == 0 + assert "mapping_diff_report" in result.stdout + + +def test_mapping_diff_report_missing_registry_exits_nonzero(tmp_path: Path) -> None: + missing_registry = tmp_path / "missing_registry.yml" + result = subprocess.run( + [*_cli_cmd(), "--registry", str(missing_registry)], + check=False, + capture_output=True, + text=True, + env=_cli_env(), + ) + assert result.returncode != 0 + assert str(missing_registry) in result.stderr + assert len(result.stderr.strip().splitlines()) == 1 + + +def test_mapping_diff_report_default_registry_missing_mentions_config_path(tmp_path: Path) -> None: + result = subprocess.run( + 
_cli_cmd(), + check=False, + capture_output=True, + text=True, + env=_cli_env(), + cwd=tmp_path, + ) + assert result.returncode != 0 + assert "config/name_registry.yml" in result.stderr + assert len(result.stderr.strip().splitlines()) == 1 + + +def test_mapping_diff_report_unreadable_registry_exits_nonzero(tmp_path: Path) -> None: + registry_path = tmp_path / "name_registry.yml" + registry_path.write_text("schema_version: 1\nentries: []\n", encoding="utf-8") + registry_path.chmod(0) + try: + result = subprocess.run( + [*_cli_cmd(), "--registry", str(registry_path)], + check=False, + capture_output=True, + text=True, + env=_cli_env(), + ) + finally: + registry_path.chmod(stat.S_IRUSR | stat.S_IWUSR) + + assert result.returncode != 0 + assert str(registry_path) in result.stderr + assert len(result.stderr.strip().splitlines()) == 1 + + +def test_mapping_diff_report_deterministic_sections(tmp_path: Path) -> None: + registry_path = tmp_path / "name_registry.yml" + registry_path.write_text( + "\n".join( + [ + "schema_version: 1", + "entries:", + " - canonical_key: bank_of_america", + " display_name: Bank of America", + " aliases:", + " - Bank of America", + ] + ) + + "\n", + encoding="utf-8", + ) + + args = [ + *_cli_cmd(), + "--registry", + str(registry_path), + "--normalization-name", + "Societe Generale", + "--normalization-name", + "Unknown House", + "--reconciliation-name", + "Unknown House", + ] + first = subprocess.run(args, check=False, capture_output=True, text=True, env=_cli_env()) + second = subprocess.run(args, check=False, capture_output=True, text=True, env=_cli_env()) + + assert first.returncode == 0 + assert second.returncode == 0 + assert first.stdout == second.stdout + assert "UNMAPPED\nUnknown House\n" in first.stdout + assert "FALLBACK_MAPPED\nSociete Generale -> Soc Gen\n" in first.stdout + assert "SUGGESTIONS\nUnknown House -> Unknown House\n" in first.stdout diff --git a/tests/test_normalization_registry_first.py 
b/tests/test_normalization_registry_first.py new file mode 100644 index 00000000..e9b8ade6 --- /dev/null +++ b/tests/test_normalization_registry_first.py @@ -0,0 +1,111 @@ +"""Integration tests for registry-first normalization and mapping diff output.""" + +from __future__ import annotations + +import logging +import shutil +from pathlib import Path + +import pytest + +from counter_risk.normalize import resolve_counterparty +from counter_risk.pipeline.run import reconcile_series_coverage +from counter_risk.reports.mapping_diff import generate_mapping_diff_report + + +def _fixture_path(name: str) -> Path: + return Path("tests/fixtures") / name + + +def _input_sources() -> dict[str, object]: + return { + "normalization": [{"counterparty": "Societe Generale"}], + "reconciliation": {"counterparties_in_data": ["Societe Generale"]}, + } + + +def test_mapping_diff_report_before_registry_alias_uses_fallback_section() -> None: + report = generate_mapping_diff_report( + _fixture_path("name_registry_before.yml"), _input_sources() + ) + + assert "FALLBACK_MAPPED\nSociete Generale -> Soc Gen\n" in report + + +def test_mapping_diff_report_after_registry_alias_removes_fallback_and_unmapped_entries() -> None: + report = generate_mapping_diff_report( + _fixture_path("name_registry_after.yml"), _input_sources() + ) + + assert "Societe Generale -> Soc Gen\n" not in report + assert "UNMAPPED\nSociete Generale\n" not in report + assert "SUGGESTIONS\nSociete Generale -> Societe Generale\n" not in report + + +def test_mapping_diff_report_changes_between_before_and_after_registry_states() -> None: + before_report = generate_mapping_diff_report( + _fixture_path("name_registry_before.yml"), + _input_sources(), + ) + after_report = generate_mapping_diff_report( + _fixture_path("name_registry_after.yml"), + _input_sources(), + ) + + assert before_report != after_report + assert "Societe Generale -> Soc Gen\n" in before_report + assert "Societe Generale -> Soc Gen\n" not in after_report + + 
+def test_resolve_counterparty_uses_registry_direct_canonical_match_before_fallback( + tmp_path: Path, +) -> None: + registry_path = tmp_path / "name_registry.yml" + registry_path.write_text( + "\n".join( + [ + "schema_version: 1", + "entries:", + " - canonical_key: soc_gen", + " display_name: Soc Gen", + " aliases:", + " - SG", + ] + ) + + "\n", + encoding="utf-8", + ) + + display_name_match = resolve_counterparty("Soc Gen", registry_path=registry_path) + canonical_key_match = resolve_counterparty("soc_gen", registry_path=registry_path) + + assert display_name_match.canonical_name == "Soc Gen" + assert display_name_match.source == "registry" + assert canonical_key_match.canonical_name == "Soc Gen" + assert canonical_key_match.source == "registry" + + +def test_reconciliation_with_after_registry_has_no_societe_generale_warning( + caplog: pytest.LogCaptureFixture, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_dir = tmp_path / "config" + config_dir.mkdir(parents=True) + shutil.copyfile( + _fixture_path("name_registry_after.yml"), + config_dir / "name_registry.yml", + ) + monkeypatch.chdir(tmp_path) + caplog.set_level(logging.WARNING) + + result = reconcile_series_coverage( + parsed_data_by_sheet={ + "Total": {"totals": [{"counterparty": "Societe Generale"}], "futures": []} + }, + historical_series_headers_by_sheet={"Total": ("Soc Gen Inc", "Legacy Counterparty")}, + ) + + assert result["warnings"] + assert not any("Societe Generale" in warning for warning in result["warnings"]) + assert all("Societe Generale" not in record.getMessage() for record in caplog.records) diff --git a/tests/test_release_spec.py b/tests/test_release_spec.py index 46bce21c..b8f3e46c 100644 --- a/tests/test_release_spec.py +++ b/tests/test_release_spec.py @@ -54,7 +54,7 @@ def _collect(*args: object, **kwargs: object) -> str: analysis_scripts = captures["analysis_args"][0] assert len(analysis_scripts) == 1 - assert 
analysis_scripts[0].endswith("src/counter_risk/cli.py") + assert analysis_scripts[0].endswith("src/counter_risk/cli/__main__.py") runtime_hooks = captures["analysis_kwargs"]["runtime_hooks"] assert len(runtime_hooks) == 1