"""Build the synthetic E2E fixture PDF at ``tests/fixtures/synthetic_giro.pdf``. Re-runnable on demand. Output bytes are stable across runs in page content, layout, and text — only the PDF's embedded timestamps change, which pipeline tests don't read. The committed fixture is what CI consumes; re-run this script locally if you change the ground truth. Contents: one A4 portrait page with six known strings placed at fixed positions near the top. The goal is reproducible ground truth, not a realistic bank statement. The pipeline's fake OCR client is seeded with those same strings (at plausible bboxes) so the E2E test can assert exact matches. Usage:: uv run python scripts/create_fixture_pdf.py """ from __future__ import annotations from pathlib import Path import fitz # PyMuPDF OUT_PATH = ( Path(__file__).resolve().parent.parent / "tests" / "fixtures" / "synthetic_giro.pdf" ) LINES: list[str] = [ "DKB", "IBAN: DE89370400440532013000", "Statement period: 01.03.2026 - 31.03.2026", "Opening balance: 1234.56 EUR", "Closing balance: 1450.22 EUR", "Statement date: 31.03.2026", ] def build() -> None: doc = fitz.open() # A4 @ 72 dpi -> 595 x 842 points. page = doc.new_page(width=595, height=842) y = 72.0 for line in LINES: page.insert_text( (72.0, y), line, fontsize=12, fontname="helv", ) y += 24.0 OUT_PATH.parent.mkdir(parents=True, exist_ok=True) # deflate=False + garbage=0 keeps the output byte-stable. doc.save( str(OUT_PATH), deflate=False, deflate_images=False, garbage=0, clean=False, ) doc.close() if __name__ == "__main__": build() print(f"wrote {OUT_PATH}")