#!/usr/bin/env python3
"""QA checks for standalone HTML manuscripts."""
from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple

# Base directory: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Directory whose ``*.html`` files are checked.
STANDALONE_DIR = ROOT.joinpath("omega_document", "admin_portal", "static", "html", "standalone")
# Scratch directory that receives the PDFs produced by the print check.
TMP_DIR = ROOT.joinpath("tmp", "standalone_qa")

# Browser executable names probed on PATH; the first one found is used.
BROWSER_CANDIDATES = ["chromium-browser", "chromium", "google-chrome-stable", "google-chrome", "chrome"]


@dataclass
class HtmlCheckResult:
    """Outcome of every QA check performed on one HTML file."""

    # The HTML file that was checked.
    path: Path
    # Verdict and combined stdout/stderr text returned by ``run_tidy``.
    tidy_ok: bool
    tidy_output: str
    # Human-readable messages for local links/resources that do not exist on disk.
    link_issues: List[str]
    # Fragment names (without the leading "#") that have no matching id in the page.
    missing_ids: List[str]
    # Location of the generated PDF; None when no PDF step ran or the browser failed.
    pdf_path: Optional[Path]
    # True only when the browser produced a non-empty PDF.
    pdf_generated: bool
    # Reason the PDF is missing (skip notice or failure text); None on success.
    pdf_message: Optional[str]


class LinkCollector(HTMLParser):
    """Collect local references and anchor targets while parsing HTML.

    After ``feed()`` the instance exposes:

    * ``links`` -- ``(tag, href)`` pairs for ``<a>`` links to local files.
    * ``resources`` -- ``(tag, url)`` pairs for local ``<img>``/``<script>``
      ``src`` values and ``<link>`` ``href`` values.
    * ``fragment_hrefs`` -- names of in-page targets (``href="#name"``),
      stored without the leading ``#``.
    * ``ids`` -- every name a fragment can point at: ``id`` attributes of
      any element plus ``name`` attributes of ``<a>`` elements.

    References that carry a URL scheme (``https:``, ``mailto:``, ``tel:``,
    ``javascript:``, ``data:`` ...) or are protocol-relative (``//host/...``)
    are not local files and are ignored.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.links: List[Tuple[str, str]] = []
        self.resources: List[Tuple[str, str]] = []
        self.fragment_hrefs: List[str] = []
        self.ids: Set[str] = set()

    @staticmethod
    def _is_local_reference(reference: str) -> bool:
        """Return True when *reference* has no URL scheme and no network location."""
        if reference.startswith("//"):
            # Protocol-relative URL: points at another host.
            return False
        scheme, colon, _ = reference.partition(":")
        # A scheme is a letter followed by letters, digits, "+", "-" or "."
        # and ends at the first ":". Checking the exact shape (instead of a
        # "http" prefix) keeps files such as "httpdocs/x.html" local while
        # still rejecting "HTTPS://...", "tel:...", "data:..." and friends.
        has_scheme = (
            bool(colon)
            and scheme.isascii()
            and scheme[:1].isalpha()
            and all(ch.isalnum() or ch in "+-." for ch in scheme)
        )
        return not has_scheme

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Record ids, in-page fragments and local file references of one tag."""
        # Valueless attributes (e.g. ``<script async>``) are dropped.
        attr_map: Dict[str, str] = {name: value for (name, value) in attrs if value is not None}
        element_id = attr_map.get("id")
        if element_id:
            self.ids.add(element_id)
        if tag == "a":
            # Legacy ``<a name="...">`` anchors are valid fragment targets too.
            anchor_name = attr_map.get("name")
            if anchor_name:
                self.ids.add(anchor_name)
            href = attr_map.get("href")
            if href:
                if href.startswith("#"):
                    fragment = href[1:]
                    # A bare "#" means "top of the document"; it needs no id.
                    if fragment:
                        self.fragment_hrefs.append(fragment)
                elif self._is_local_reference(href):
                    self.links.append((tag, href))
        elif tag in {"img", "script"}:
            src = attr_map.get("src")
            if src and self._is_local_reference(src):
                self.resources.append((tag, src))
        elif tag == "link":
            href = attr_map.get("href")
            if href and self._is_local_reference(href):
                self.resources.append((tag, href))


def run_tidy(html_path: Path) -> Tuple[bool, str]:
    """Validate *html_path* with the ``tidy`` command-line tool.

    Returns ``(ok, output)``. ``ok`` is True when tidy exits with status 0
    or 1, and also when tidy is not installed (the check is then skipped and
    ``output`` says so). ``output`` is tidy's stripped stdout and stderr
    joined by a newline, or a failure notice when tidy could not be started.
    """
    if not shutil.which("tidy"):
        return True, "tidy executable not found; skipped"

    command = ["tidy", "-quiet", "-errors", "-utf8", str(html_path)]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, check=False)
    except OSError as exc:
        return False, f"Failed to invoke tidy: {exc}"

    # Keep only the streams that actually contain text.
    streams = (proc.stdout.strip(), proc.stderr.strip())
    report = "\n".join(chunk for chunk in streams if chunk)
    return proc.returncode in (0, 1), report


def resolve_local_path(base: Path, reference: str) -> Path:
    """Map a reference found in the HTML file *base* to a filesystem path.

    A reference starting with ``/`` is looked up under the portal's
    ``static`` directory (a leading ``static/`` segment is dropped first).
    Any other reference is resolved relative to the directory of *base*.
    """
    if not reference.startswith("/"):
        return (base.parent / reference).resolve()

    static_prefix = "static/"
    trimmed = reference.lstrip("/")
    if trimmed.startswith(static_prefix):
        trimmed = trimmed[len(static_prefix):]
    return ROOT.joinpath("omega_document", "admin_portal", "static", trimmed)


def check_links(html_path: Path) -> Tuple[List[str], List[str]]:
    """Find broken local references and dangling in-page anchors.

    The file is read as UTF-8 and parsed with ``LinkCollector``.

    Returns ``(link_issues, missing_ids)``:

    * ``link_issues`` -- one message per local link/resource whose target
      file does not exist.
    * ``missing_ids`` -- fragment names used in ``href="#..."`` that have no
      matching target in the document.
    """
    # Local import: this is the only place in the file that needs it.
    from urllib.parse import unquote

    collector = LinkCollector()
    collector.feed(html_path.read_text(encoding="utf-8"))
    link_issues: List[str] = []
    for tag, href in collector.links + collector.resources:
        # Root-relative URLs outside /static/ are not checked against the filesystem.
        if href.startswith("/") and not href.startswith("/static/"):
            continue
        # Only the path component names a file: drop "#fragment" and
        # "?query" (in that order) and undo percent-escapes such as "%20".
        file_part = unquote(href.split("#", 1)[0].split("?", 1)[0])
        if not file_part:
            # e.g. "?page=2": refers to the current document itself.
            continue
        target_path = resolve_local_path(html_path, file_part)
        if not target_path.exists():
            link_issues.append(f"Missing resource for {tag} -> {href} (expected {target_path})")
    missing_ids = [frag for frag in collector.fragment_hrefs if frag not in collector.ids]
    return link_issues, missing_ids


def find_browser() -> Optional[str]:
    """Return the first name in ``BROWSER_CANDIDATES`` found on PATH, else None."""
    return next((name for name in BROWSER_CANDIDATES if shutil.which(name)), None)


def generate_pdf(browser: str, html_path: Path, slug: str) -> Tuple[bool, Optional[Path], Optional[str]]:
    """Print *html_path* to ``TMP_DIR/<slug>.pdf`` with a headless browser.

    Args:
        browser: Executable name or path of a Chrome/Chromium binary.
        html_path: HTML file to render.
        slug: Base name (without extension) of the PDF to create.

    Returns:
        ``(ok, pdf_path, message)``. On success ``message`` is None. When the
        browser cannot be started or exits with a non-zero status,
        ``pdf_path`` is None and ``message`` explains why. When the browser
        succeeds but leaves no (or an empty) file, ``pdf_path`` is still
        returned alongside the failure message.
    """
    TMP_DIR.mkdir(parents=True, exist_ok=True)
    pdf_path = TMP_DIR / f"{slug}.pdf"
    # Remove a stale PDF so an old file cannot be mistaken for a fresh one.
    if pdf_path.exists():
        pdf_path.unlink()
    # as_uri() needs an absolute path and percent-encodes characters such as
    # spaces, "#", "?" and "%" that would otherwise corrupt the file URL.
    url = html_path.resolve().as_uri()
    cmd = [
        browser,
        "--headless",
        "--disable-gpu",
        f"--print-to-pdf={pdf_path}",
        url,
    ]
    try:
        completed = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except OSError as exc:
        return False, None, f"Failed to run {browser}: {exc}"
    if completed.returncode != 0:
        message = (completed.stderr or completed.stdout).strip()
        return False, None, f"Browser exited with {completed.returncode}: {message}".strip()
    if not pdf_path.exists() or pdf_path.stat().st_size == 0:
        return False, pdf_path, "PDF not generated or empty"
    return True, pdf_path, None


def run_checks(files: Iterable[Path]) -> List[HtmlCheckResult]:
    """Run the tidy, link and print-to-PDF checks on every file in *files*.

    The browser is looked up once; when none is available the PDF step is
    skipped for all files and each result carries a skip notice instead.
    Results are returned in the order the files were given.
    """
    browser = find_browser()
    collected: List[HtmlCheckResult] = []
    for html_path in files:
        tidy_ok, tidy_output = run_tidy(html_path)
        link_issues, missing_ids = check_links(html_path)
        if browser:
            # The PDF is named after the HTML file (without its extension).
            pdf_ok, pdf_path, pdf_message = generate_pdf(browser, html_path, html_path.stem)
        else:
            pdf_ok, pdf_path = False, None
            pdf_message = "Skipped (no headless Chrome/Chromium detected)"
        outcome = HtmlCheckResult(
            path=html_path,
            tidy_ok=tidy_ok,
            tidy_output=tidy_output,
            link_issues=link_issues,
            missing_ids=missing_ids,
            pdf_path=pdf_path,
            pdf_generated=pdf_ok,
            pdf_message=pdf_message,
        )
        collected.append(outcome)
    return collected


def _result_to_payload(result: "HtmlCheckResult") -> Dict[str, object]:
    """Convert one check result into the JSON-serialisable summary dict."""
    return {
        "path": str(result.path.relative_to(ROOT)),
        "tidy_ok": result.tidy_ok,
        "tidy_output": result.tidy_output,
        "link_issues": result.link_issues,
        "missing_fragment_ids": result.missing_ids,
        "pdf_generated": result.pdf_generated,
        "pdf_path": str(result.pdf_path) if result.pdf_path else None,
        "pdf_message": result.pdf_message,
    }


def _print_report(result: "HtmlCheckResult") -> None:
    """Print the human-readable report section for one checked file."""
    print(f"\n=== {result.path.name} ===")
    print(f"tidy: {'ok' if result.tidy_ok else 'issues detected'}")
    # The "tidy not installed" notice is noise in the text report; hide it.
    if result.tidy_output and result.tidy_output != "tidy executable not found; skipped":
        print(result.tidy_output)
    if result.link_issues:
        print("Broken references:")
        for issue in result.link_issues:
            print(f"  - {issue}")
    if result.missing_ids:
        print("Missing in-page anchors:")
        for fragment in result.missing_ids:
            print(f"  - #{fragment}")
    if result.pdf_generated:
        size = result.pdf_path.stat().st_size if result.pdf_path else 0
        print(f"print-to-pdf: ok ({size} bytes)")
    elif (result.pdf_message or "").startswith("Skipped"):
        # No browser available: the step never ran.
        print(f"print-to-pdf: skipped - {result.pdf_message}")
    else:
        # The browser ran but produced no usable PDF: that is a failure,
        # not a skip, and must be reported as such.
        print(f"print-to-pdf: failed - {result.pdf_message}")


def main() -> int:
    """Check every ``*.html`` file in ``STANDALONE_DIR`` and report the results.

    With ``--json`` a machine-readable JSON list is printed; otherwise a
    text report is printed per file.

    Returns:
        1 when the standalone directory does not exist, otherwise 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON summary")
    args = parser.parse_args()

    if not STANDALONE_DIR.exists():
        print(f"Standalone directory not found: {STANDALONE_DIR}", file=sys.stderr)
        return 1

    html_files = sorted(STANDALONE_DIR.glob("*.html"))
    if not html_files:
        print("No standalone HTML files to check.")
        return 0

    results = run_checks(html_files)

    if args.json:
        print(json.dumps([_result_to_payload(result) for result in results], indent=2))
        return 0

    for result in results:
        _print_report(result)
    # NOTE(review): the exit status stays 0 even when checks report issues;
    # confirm whether callers (e.g. CI) expect a non-zero status in that case.
    return 0


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
