tests/docs/test_complete_examples.py

import csv
import typing
import unittest.mock
import uuid
import warnings
from collections.abc import Generator, Iterator
from dataclasses import dataclass, field
from pathlib import Path

import pytest

import ehrql.main


@dataclass
class DatasetDefinitionExample:
    """Store details of a complete ehrQL dataset definition example."""

    path: Path

    start_line: int
    end_line: int | None = None

    content_lines: list[str] = field(default_factory=list)

    def relative_path(self) -> Path:
        """Return the relative path of the dataset definition source file
        to the source code root."""
        source_code_path = Path(__file__).parents[2]
        return self.path.relative_to(source_code_path)


def discover_paths(glob_string: str) -> Generator[Path, None, None]:
    """Return a lazy iterator over documentation files matching a glob.

    The glob is evaluated relative to the `docs` directory that sits in the
    third ancestor directory of this file. Nothing is read from disk until
    the returned iterator is consumed.
    """
    return (Path(__file__).parents[2] / "docs").glob(glob_string)


def find_complete_ehrql_examples_in_markdown(
    file: typing.TextIO,
) -> Iterator[DatasetDefinitionExample]:
    """Yield each closed code block labelled as ```ehrql in a Markdown file.

    Incomplete ehrQL dataset definitions should be labelled as ```python,
    and not with ```ehrql.

    Every yielded example records the path of `file`, the 1-indexed line
    numbers of its first and last content lines, and the content lines
    themselves exactly as read (line endings are kept).

    A warning is issued, and the unfinished block is dropped, when a new
    ehrQL block is opened before the previous one has been closed, or when
    the file ends while an ehrQL block is still open.
    """
    # There are three kinds of possible ehrql code blocks that we might see.
    # 1. A complete and correctly formatted example like:
    #
    # ```ehrql
    # …
    # ```
    #
    # 2. An incorrectly formatted example where we see another open code block
    #    before a closed one. Only a new ehrQL fence is detected here;
    #    any other fence line is treated as ordinary block content.
    #
    # ```ehrql
    # …
    # ```ehrql
    #
    # 3. An incorrectly formatted example where the code block is not closed
    #    before the end of file.
    #
    # ```ehrql
    # …
    # EOF
    ehrql_fence_prefixes = ("```ehrql", "``` ehrql")
    in_ehrql_block = False

    for line_number, line in enumerate(file, start=1):
        # Does this line open an ehrQL code block?
        if line.startswith(ehrql_fence_prefixes):
            if in_ehrql_block:
                # TODO: Should this fail completely?
                # NB: This drops the incomplete block entirely,
                # because `example` is rebound below without being yielded.
                warnings.warn(
                    f"In {file.name}, a new ehrQL code block is opened on line {line_number} before the previous one has been closed."
                )
            in_ehrql_block = True
            # The content starts on the line after the opening fence.
            example = DatasetDefinitionExample(
                path=Path(file.name),
                start_line=line_number + 1,
            )
            continue

        # If we are not in an ehrQL code block,
        # and the line does not mark the start of an ehrQL code block,
        # we can ignore the line.
        if not in_ehrql_block:
            continue

        # This marks the end of the current ehrQL code block,
        # so store the example and look for further examples.
        # Whitespace following the code block markers is permitted,
        # and we need to strip the newline to match.
        if line.rstrip() == "```":
            in_ehrql_block = False
            # The content ends on the line before the closing fence.
            example.end_line = line_number - 1
            yield example
            continue

        # If we are in an ehrQL code block,
        # and the current line does not close it,
        # append the line to the example.
        example.content_lines.append(line)

    if in_ehrql_block:
        # TODO: Should this fail completely?
        # NB: Incomplete examples are dropped.
        # `start_line - 1` is the line holding the opening fence.
        # The file is named via `file.name`, as in the warning above.
        warnings.warn(
            f"In {file.name}, an ehrQL code block opened on line {example.start_line - 1} is not closed."
        )


def generate_complete_ehrql_examples() -> (
    Generator[DatasetDefinitionExample, None, None]
):
    """Yield every complete ehrQL example found in the documentation.

    Examples come from two places, in this order:

    * code blocks extracted from each Markdown file;
    * each `.py` file, taken as one example spanning the whole file.

    Assertions guard against finding no files of either kind, and against
    an empty `.py` file.
    """
    markdown_files = list(discover_paths("**/*.md"))
    assert markdown_files, "No Markdown files found"

    for markdown_file in markdown_files:
        with open(markdown_file) as handle:
            yield from find_complete_ehrql_examples_in_markdown(handle)

    python_files = list(discover_paths("**/*.py"))
    assert python_files, "No .py files found"

    for python_file in python_files:
        with open(python_file) as handle:
            lines = handle.readlines()
            assert lines
            # The whole file is the example: first line to last line.
            yield DatasetDefinitionExample(
                path=Path(handle.name),
                start_line=1,
                end_line=len(lines),
                content_lines=lines,
            )


def create_example_test_case_id(example: DatasetDefinitionExample) -> str:
    """Build the pytest test case ID for one DatasetDefinitionExample.

    The ID is the example's relative path followed by its line span,
    formatted as `<path> L<start>:L<end>`.
    """
    location = example.relative_path()
    line_span = f"L{example.start_line}:L{example.end_line}"
    return f"{location} {line_span}"


@pytest.mark.parametrize(
    "example",
    generate_complete_ehrql_examples(),
    ids=create_example_test_case_id,
)
def test_ehrql_generate_dataset_example(
    tmp_path: Path, example: DatasetDefinitionExample
) -> None:
    """Test that an ehrQL generate dataset example works.

    The example is written to a temporary dataset definition file, a dataset
    is generated from it into a temporary CSV file, and that CSV is checked
    to be non-empty and readable.
    """
    tmp_filename_base = str(uuid.uuid4())

    # The content lines keep the line endings they were read with,
    # so concatenate them directly; joining with "\n" would insert an extra
    # blank line after every line of the example.
    dataset_definition_content = "".join(example.content_lines)
    tmp_dataset_definition_path = tmp_path / (tmp_filename_base + ".py")
    tmp_dataset_definition_path.write_text(dataset_definition_content)

    tmp_dataset_path = tmp_path / (tmp_filename_base + ".csv")

    # Monkeypatch getting a CSV codelist because:
    # * inline code blocks don't have any easy way of providing a separate file
    # * there is no codelist code that satisfies constraints of all code systems,
    #   so patch out the validity check and just pass in a fake codelist
    with (
        unittest.mock.patch(
            "ehrql.codelist_from_csv",
            return_value={"not_a_real_code!": "not_a_real_category!"},
        ),
        unittest.mock.patch(
            "ehrql.codes.BaseCode.__post_init__",
            return_value=None,
        ),
    ):
        # The return value is not stored:
        # the output CSV gets written to a temporary file.
        ehrql.main.generate_dataset(tmp_dataset_definition_path, tmp_dataset_path)

    with open(tmp_dataset_path) as f:
        csv_content = f.readlines()

    # If the dataset definition works, we should have a valid CSV.
    assert csv_content, f"CSV is empty for example {example}"

    # Check we can read the CSV content by consuming every row.
    csv_reader = csv.DictReader(csv_content)
    for _row in csv_reader:
        pass