#!/usr/bin/env python3
"""Build standalone HTML pages for RFT manuscripts.

The standalone HTML artifacts are now treated as the canonical manuscripts.
To avoid clobbering hand edits, regeneration from the legacy body fragments is
gated behind an explicit ``--regen`` flag.
"""
from __future__ import annotations

import argparse
import hashlib
import json
import re
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from html import escape, unescape
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

# Directory one level above the folder holding this script (presumably the
# project root -- confirm against the repository layout).
ROOT = Path(__file__).resolve().parents[1]
# Static tree that every asset lookup in this module is rooted at.
STATIC_ROOT = ROOT / "omega_document" / "admin_portal" / "static"
# Folder the manuscript body fragments are read from.
SOURCE_HTML_DIR = STATIC_ROOT / "html"
# Folder the generated standalone pages are written to.
DEST_HTML_DIR = SOURCE_HTML_DIR / "standalone"
# Parent of the per-manuscript asset folders (copied images + manifest.json).
ASSET_ROOT = STATIC_ROOT / "standalone_assets"

# URLs referenced from the <head> of every generated page.
STYLESHEET_HREF = "/static/css/standalone.css"
STANDALONE_JS_SRC = "/static/js/standalone.js"
MATHJAX_CONFIG_SRC = "/static/js/mathjax.js"
MATHJAX_BUNDLE_SRC = "/static/vendor/mathjax/es5/tex-chtml-full.js"

# extract_headings() drops headings deeper than <h{TOC_LEVEL_LIMIT + 1}>.
TOC_LEVEL_LIMIT = 3


def build_asset_lookup(static_root: Optional[Path] = None) -> Dict[str, Path]:
    """Index candidate asset files by file name for fallback resolution.

    Every regular file below the root is registered under two keys: its
    lower-cased file name, and a sanitized variant in which every character
    of the stem outside ``[a-z0-9]`` is removed. When several files compete
    for a key, the first one in sorted path order wins.

    Files are skipped when their path below the root contains a
    ``standalone_assets`` component, or both an ``html`` and a ``standalone``
    component, so generated output is never picked up as a source.

    Args:
        static_root: Directory to scan. Defaults to ``STATIC_ROOT``.

    Returns:
        Mapping from lookup key to the path of the matching file.
    """
    root = STATIC_ROOT if static_root is None else static_root
    index: Dict[str, Path] = {}
    # rglob() yields entries in filesystem order, which differs between
    # machines; sorting makes the "first match wins" rule reproducible.
    for path in sorted(root.rglob("*")):
        if not path.is_file():
            continue
        # Judge exclusions on the path below the root only, so the location
        # of the checkout itself (e.g. under /srv/html/) cannot hide files.
        parts = path.relative_to(root).parts
        if "standalone_assets" in parts:
            continue
        if "html" in parts and "standalone" in parts:
            continue
        index.setdefault(path.name.lower(), path)
        sanitized = re.sub(r"[^a-z0-9]+", "", path.stem.lower()) + path.suffix.lower()
        index.setdefault(sanitized, path)
    return index


# Built once at import time; copy_assets() falls back to it when an <img> src
# does not resolve directly under STATIC_ROOT.
ASSET_INDEX = build_asset_lookup()


def slugify(value: str) -> str:
    """Return a lower-case, hyphen-separated slug for *value*.

    Only runs of ``a-z`` and ``0-9`` survive; everything between them becomes
    a single hyphen. Input without any such characters yields ``"section"``.
    """
    tokens = re.findall(r"[a-z0-9]+", value.strip().lower())
    if not tokens:
        return "section"
    return "-".join(tokens)


@dataclass
class Manuscript:
    """Build configuration for one standalone manuscript page."""

    # Key used in the ``/papers/<document_key>`` links of the page header.
    document_key: str
    # Names the output file (``<slug>.html``) and the per-manuscript asset folder.
    slug: str
    # Shown in the <title>, the hero heading, and the last breadcrumb.
    title: str
    # Shown below the hero heading.
    description: str
    # Body fragment the page is built from.
    source_html: Path
    # When False, main() runs normalize_volume1_headings() on the body first.
    has_numbered_headings: bool = True


# Registry of every manuscript main() can build; ``--slug`` filters on the
# ``slug`` field of these entries.
MANUSCRIPTS: List[Manuscript] = [
    Manuscript(
        document_key="omega_document",
        slug="omega-document",
        title="Omega Document: Unified Scalaron–Twistor Framework",
        description="Foundational Omega overview stitching the scalaron–twistor program into a single manuscript.",
        source_html=SOURCE_HTML_DIR / "omega_document_body.html",
    ),
    Manuscript(
        document_key="rft_volume1_final",
        slug="rft-volume1-final",
        title="RFT Volume 1: Emergent Spacetime (Final)",
        description="Camera-ready release candidate with RC bundle stamp.",
        source_html=SOURCE_HTML_DIR / "volume1_body.html",
        has_numbered_headings=False,
    ),
    Manuscript(
        document_key="rft_volume1_rc11",
        slug="rft-volume1-rc11",
        title="RFT Volume 1: Unified Field Dynamics (RC11 Preprint)",
        description="v1.0-rc11 accessibility build with executive summary and preregistered gate overview.",
        source_html=SOURCE_HTML_DIR / "volume1_rc11_body.html",
    ),
    Manuscript(
        document_key="rft_volume2_rc12",
        slug="rft-volume2-rc12",
        title="RFT Volume 2: Geometric Necessity (RC12)",
        description="RC12 constants and embedding closure updates with deterministic CI guardrails.",
        source_html=SOURCE_HTML_DIR / "volume2_body.html",
    ),
]


def strip_html_tags(value: str) -> str:
    """Return *value* with tags replaced by spaces and whitespace collapsed.

    Entities are left untouched; only ``<...>`` spans are removed.
    """
    without_tags = re.sub(r"<[^>]+>", " ", value)
    # split() with no argument drops leading/trailing whitespace and treats
    # every whitespace run as one separator.
    return " ".join(without_tags.split())


# One heading element as (level 1-6, id attribute value, text label).
Heading = Tuple[int, str, str]


def extract_headings(html: str) -> List[Heading]:
    """Collect the headings of *html* that belong in the table of contents.

    A heading is kept when it carries a non-empty ``id`` attribute, has
    visible text, and is no deeper than ``<h{TOC_LEVEL_LIMIT + 1}>``.

    Args:
        html: Body markup to scan.

    Returns:
        ``(level, id, text)`` tuples in document order. ``id`` is the raw
        attribute value as written in the markup; ``text`` is plain text with
        tags removed and character references decoded.
    """
    heading_pattern = re.compile(
        r"<h(?P<level>[1-6])([^>]*)>(.*?)</h(?P=level)>", re.S | re.I
    )
    # The lookbehind keeps attributes such as data-id="..." from being taken
    # for the element id; both quote styles are accepted.
    id_pattern = re.compile(r"""(?<![\w:-])id\s*=\s*(["'])(.+?)\1""", re.S | re.I)

    headings: List[Heading] = []
    for match in heading_pattern.finditer(html):
        level = int(match.group("level"))
        if level > TOC_LEVEL_LIMIT + 1:
            continue
        id_match = id_pattern.search(match.group(2))
        if not id_match:
            continue
        # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
        # is never mistaken for markup. render_toc() escapes the label again;
        # leaving "&amp;" encoded here would show up as "&amp;amp;" in the page.
        decoded = unescape(strip_html_tags(match.group(3)))
        # Decoding can introduce new whitespace (e.g. &nbsp;), so normalize again.
        text = " ".join(decoded.split())
        if not text:
            continue
        headings.append((level, id_match.group(2), text))
    return headings


def nest_headings(headings: Iterable[Heading]) -> List[Dict[str, object]]:
    """Turn a flat, document-ordered heading list into a tree.

    Each heading becomes a dict with ``level``, ``id``, ``text`` and
    ``children`` keys and is attached to the closest preceding heading of a
    strictly smaller level; headings without such a parent end up in the
    returned top-level list.
    """
    top_level: List[Dict[str, object]] = []
    # Chain of currently open ancestors as (level, children list); the
    # level-0 sentinel stands for the document itself.
    open_levels = [(0, top_level)]
    for level, heading_id, text in headings:
        children: List[Dict[str, object]] = []
        entry = {"level": level, "id": heading_id, "text": text, "children": children}
        # Close every ancestor that is not shallower than the new heading.
        while open_levels and open_levels[-1][0] >= level:
            open_levels.pop()
        open_levels[-1][1].append(entry)
        open_levels.append((level, children))
    return top_level


def render_toc(nodes: List[Dict[str, object]], depth: int = 0) -> str:
    """Render a heading tree as nested ``<ol>`` markup.

    The outermost list gets the ``toc-list`` class, every nested list
    ``toc-sublist``. Labels are HTML-escaped; ids are inserted as given.
    An empty *nodes* list renders as an empty string.
    """
    if not nodes:
        return ""
    list_class = "toc-sublist" if depth else "toc-list"
    chunks = [f'<ol class="{list_class}">']
    for entry in nodes:
        link = f'<li><a href="#{entry["id"]}">{escape(entry["text"])}</a>'
        nested = render_toc(entry["children"], depth + 1)
        chunks.append(f"{link}{nested}</li>")
    chunks.append("</ol>")
    return "".join(chunks)


# Matches a paragraph whose entire content is a single bold run, which is the
# shape normalize_volume1_headings() promotes to a heading. The tempered dot
# refuses to step over a closing </strong> or </p>: with a plain ".+?" under
# re.S, a bold lead-in such as "<p><strong>Note</strong> ...</p>" would be
# fused with the next bold-only paragraph and swallow that heading.
SECTION_STRONG_PATTERN = re.compile(
    r"<p><strong>((?:(?!</strong>|</p>).)+?)</strong></p>", re.S
)
# Splits heading text such as "1.2.3) Title" into the dotted section number
# (group 1) and the remaining title (group 2); separators in between
# (whitespace, ".", ")") are discarded.
NUMBERED_HEADING_PATTERN = re.compile(r"^(\d+(?:\.\d+)*)[\s.)]*(.*)$", re.S)


def normalize_volume1_headings(html: str) -> str:
    """Promote bold-only paragraphs to heading elements with unique ids.

    Every match of ``SECTION_STRONG_PATTERN`` is examined in document order:

    * the first one whose text does not start with a digit becomes the
      ``<h1>`` title;
    * one that starts with a dotted section number becomes ``<h2>``..``<h6>``,
      one level deeper per dot in the number (capped at ``<h6>``);
    * anything else is left unchanged.

    Ids are derived from the heading text with ``slugify`` and suffixed with
    ``-2``, ``-3``, ... when the plain slug is already taken.

    Args:
        html: Body fragment to transform.

    Returns:
        The fragment with the matching paragraphs replaced by headings.
    """
    has_title = False
    # Start from the ids the fragment already contains, so a generated anchor
    # can never duplicate one that exists elsewhere in the document.
    existing_ids: set[str] = {
        found.group(2)
        for found in re.finditer(
            r"""(?<![\w:-])id\s*=\s*(["'])(.*?)\1""", html, re.S | re.I
        )
    }

    def ensure_unique(slug: str) -> str:
        """Reserve and return *slug*, or its first free numbered variant."""
        candidate = slug
        counter = 2
        while candidate in existing_ids:
            candidate = f"{slug}-{counter}"
            counter += 1
        existing_ids.add(candidate)
        return candidate

    def replacement(match: re.Match[str]) -> str:
        """Return the heading markup for one bold paragraph, or the paragraph itself."""
        nonlocal has_title
        flattened = re.sub(r"\s+", " ", match.group(1)).strip()
        if not flattened:
            return match.group(0)
        if not has_title and not flattened[0].isdigit():
            has_title = True
            slug = ensure_unique(slugify(flattened))
            return f'<h1 id="{slug}">{flattened}</h1>'
        number_match = NUMBERED_HEADING_PATTERN.match(flattened)
        if not number_match:
            return match.group(0)
        label = number_match.group(1).strip(".")
        rest = number_match.group(2).strip()
        heading_text = f"{label} {rest}".strip()
        # "1" -> depth 1 -> <h2>, "1.1" -> depth 2 -> <h3>, ...; <h1> is
        # reserved for the title and <h6> is the deepest level HTML offers.
        depth = max(1, min(5, label.count(".") + 1))
        tag = f"h{depth + 1}"
        slug = ensure_unique(slugify(heading_text))
        return f'<{tag} id="{slug}">{heading_text}</{tag}>'

    return SECTION_STRONG_PATTERN.sub(replacement, html)


# Splits an <img> tag into three groups -- everything up to the opening quote
# of src, the src value, and the rest of the tag -- so the src can be swapped
# while the other attributes stay verbatim. Only double-quoted src values match.
IMG_PATTERN = re.compile(r"(<img[^>]+src=\")([^\"]+)(\"[^>]*>)", re.IGNORECASE)


def copy_assets(slug: str, html: str) -> Tuple[str, List[Dict[str, str]]]:
    """Copy locally referenced images next to the page and rewrite their ``src``.

    For every ``<img>`` matched by ``IMG_PATTERN``:

    * ``http(s)://`` and protocol-relative (``//host/...``) sources are left
      unchanged and recorded with the note ``external-reference``;
    * ``data:`` URIs are left unchanged and recorded (shortened) with the
      note ``inline-data``;
    * any other source is resolved under ``STATIC_ROOT`` (a leading ``/`` and
      ``static/`` prefix are ignored), falling back to a file-name lookup in
      ``ASSET_INDEX``. The file is copied to ``ASSET_ROOT/<slug>/`` and the
      ``src`` is pointed at ``/static/standalone_assets/<slug>/<name>``;
    * a source that cannot be resolved is replaced by a visible
      ``missing-asset`` placeholder, recorded with the note ``missing`` and
      reported on stderr.

    Args:
        slug: Manuscript slug; names the destination folder.
        html: Body markup to process.

    Returns:
        The rewritten markup and one manifest entry per ``<img>`` occurrence.
        Entries for copied files carry a ``sha256`` of the copied bytes;
        ``copied`` is ``None`` for missing assets.
    """
    asset_dir = ASSET_ROOT / slug
    asset_dir.mkdir(parents=True, exist_ok=True)
    manifest: List[Dict[str, str]] = []
    # Resolved source file -> destination file name, so one file reached
    # through several src spellings is still copied only once.
    name_registry: Dict[Path, str] = {}
    used_names: set[str] = set()

    def repl(match: re.Match[str]) -> str:
        """Return the replacement markup for one <img> tag."""
        prefix, src, suffix = match.groups()
        lowered = src.lower()
        if lowered.startswith("data:"):
            # Inline images are self-contained; treating them as a path would
            # turn a working image into a "missing asset" placeholder. Keep
            # the manifest readable by recording only the start of the URI.
            preview = src if len(src) <= 80 else src[:77] + "..."
            manifest.append({"original": preview, "copied": preview, "note": "inline-data"})
            return match.group(0)
        if lowered.startswith(("http://", "https://", "//")):
            # External images are not expected; leave unchanged but record.
            manifest.append({"original": src, "copied": src, "note": "external-reference"})
            return match.group(0)

        clean_src = src.lstrip("/")
        if clean_src.startswith("static/"):
            clean_src = clean_src[len("static/"):]
        source_path = STATIC_ROOT / clean_src
        resolved_from = clean_src
        # is_file() rather than exists(): a src that resolves to a directory
        # must go through the fallback instead of failing in read_bytes().
        if not source_path.is_file():
            reference = Path(clean_src)
            sanitized_key = (
                re.sub(r"[^a-z0-9]+", "", reference.stem.lower()) + reference.suffix.lower()
            )
            fallback = ASSET_INDEX.get(reference.name.lower()) or ASSET_INDEX.get(sanitized_key)
            if not fallback:
                manifest.append({"original": src, "copied": None, "note": "missing"})
                print(f"[warn] Missing asset for {slug}: {src}", file=sys.stderr)
                return (
                    f'<div class="missing-asset" data-missing="{escape(src)}">'
                    f"Missing asset: {escape(src)}</div>"
                )
            source_path = fallback
            resolved_from = str(source_path.relative_to(STATIC_ROOT))

        candidate = name_registry.get(source_path)
        if candidate is None:
            # Different source files sharing a file name get -2, -3, ... suffixes.
            candidate = source_path.name
            counter = 2
            while candidate in used_names:
                candidate = f"{source_path.stem}-{counter}{source_path.suffix}"
                counter += 1
            name_registry[source_path] = candidate
            used_names.add(candidate)

        payload = source_path.read_bytes()
        dest_path = asset_dir / candidate
        # Rewrite the copy whenever it is absent or differs from the source; a
        # file left over from an earlier run may be stale or may even belong
        # to a different source that happened to share the name.
        if not dest_path.is_file() or dest_path.read_bytes() != payload:
            dest_path.write_bytes(payload)

        entry = {
            "original": src,
            "copied": f"standalone_assets/{slug}/{candidate}",
            "sha256": hashlib.sha256(payload).hexdigest(),
        }
        if resolved_from != clean_src:
            # Record where the file was really found when the fallback kicked in.
            entry["source_override"] = resolved_from
        manifest.append(entry)
        return f"{prefix}/static/standalone_assets/{slug}/{candidate}{suffix}"

    updated_html = IMG_PATTERN.sub(repl, html)
    return updated_html, manifest


def build_template(manuscript: Manuscript, body_html: str, toc_html: str) -> str:
    """Wrap a processed body fragment in the complete standalone page.

    The page consists of a hero header (breadcrumbs, title, description,
    print and reader-shell actions), a table-of-contents section, the body
    inside ``<main>``, and a footer stamped with the current UTC time.
    Title and description are HTML-escaped; *body_html* and *toc_html* are
    inserted as given. An empty *toc_html* is replaced by a short notice.
    """
    safe_title = escape(manuscript.title)
    safe_description = escape(manuscript.description)
    reader_href = f"/papers/{manuscript.document_key}"
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    toc_body = toc_html or "<p>No headings detected.</p>"

    crumbs = (
        '<a href="/">Admin Dashboard</a>',
        f'<a href="{reader_href}">Reader View</a>',
        safe_title,
    )
    crumb_items = "".join(f"<li>{crumb}</li>" for crumb in crumbs)
    breadcrumb_html = (
        '<nav class="standalone-breadcrumbs" aria-label="Breadcrumb"><ol>'
        f"{crumb_items}</ol></nav>"
    )

    hero = f"""
<header class="standalone-hero">
  {breadcrumb_html}
  <div class="hero-heading">
    <h1>{safe_title}</h1>
    <p class="hero-description">{safe_description}</p>
  </div>
  <div class="hero-actions">
    <button type="button" data-action="print">Print to PDF</button>
    <a class="secondary" href="{reader_href}">Open reader shell</a>
  </div>
</header>
"""
    toc_section = f"""
<section class="standalone-toc" aria-label="Table of contents">
  <div class="toc-header">
    <h2>Contents</h2>
    <button type="button" class="toc-toggle" data-toggle="toc" aria-expanded="true">Collapse</button>
  </div>
  <div class="toc-body">{toc_body}</div>
</section>
"""
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>{safe_title} – RFT Standalone</title>
  <link rel="stylesheet" href="{STYLESHEET_HREF}" />
  <script src="{MATHJAX_CONFIG_SRC}"></script>
  <script defer src="{MATHJAX_BUNDLE_SRC}"></script>
  <script defer src="{STANDALONE_JS_SRC}"></script>
</head>
<body data-standalone-slug="{manuscript.slug}">
  <a class="skip-link" href="#main-content">Skip to content</a>
  {hero}
  <div class="standalone-grid">
    {toc_section}
    <main id="main-content" class="standalone-main">
      <article class="document-body">
        {body_html}
      </article>
    </main>
  </div>
  <footer class="standalone-footer">
    <p>Generated {generated} · SHA manifests located at <code>standalone_assets/{manuscript.slug}/manifest.json</code>.</p>
  </footer>
</body>
</html>
"""


def write_manifest(slug: str, manifest: List[Dict[str, str]]) -> None:
    """Write ``manifest.json`` for *slug* into its asset folder.

    The file holds the generation time (UTC, ISO 8601), the number of entries
    that carry a ``sha256`` value, and the manifest entries themselves.
    """
    hashed_entries = sum(1 for entry in manifest if entry.get("sha256"))
    document = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "asset_count": hashed_entries,
        "assets": manifest,
    }
    serialized = json.dumps(document, indent=2, ensure_ascii=False)
    target = ASSET_ROOT / slug / "manifest.json"
    target.write_text(f"{serialized}\n", encoding="utf-8")


def _parse_args() -> argparse.Namespace:
    """Parse the command line.

    Recognized options are ``--regen`` (boolean flag) and ``--slug``, which
    may be repeated and is collected into a list (``None`` when absent).
    """
    regen_help = "Overwrite existing standalone HTML by rebuilding from the source body fragments."
    slug_help = "Limit regeneration to one or more standalone slugs (e.g. rft-volume1-final)."

    cli = argparse.ArgumentParser(
        description="Regenerate standalone HTML from the legacy body fragments.",
    )
    cli.add_argument("--regen", help=regen_help, action="store_true")
    cli.add_argument("--slug", help=slug_help, action="append")
    return cli.parse_args()


def main() -> None:
    """Entry point: rebuild the selected standalone pages when ``--regen`` is given.

    Without ``--regen`` nothing is written and a notice is printed instead.
    With it, every selected manuscript is rebuilt: headings are normalized
    when required, the table of contents is derived, images are copied, and
    the page plus its asset manifest are written.

    Raises:
        FileNotFoundError: If the body fragment of a selected manuscript is missing.
    """
    args = _parse_args()
    requested_slugs = set(args.slug or [])
    selected = [m for m in MANUSCRIPTS if not requested_slugs or m.slug in requested_slugs]

    if not selected:
        print("No manuscripts matched the provided slug filter.")
        return

    # A mistyped slug next to a valid one would otherwise be dropped silently.
    unknown_slugs = sorted(requested_slugs - {m.slug for m in selected})
    if unknown_slugs:
        print(
            f"[warn] Ignoring unknown slug(s): {', '.join(unknown_slugs)}",
            file=sys.stderr,
        )

    if not args.regen:
        print(
            "Standalone HTML is treated as the canonical manuscript now. No files were regenerated.\n"
            "Run this script with --regen to rebuild from the legacy fragments when that workflow is intentional."
        )
        return

    DEST_HTML_DIR.mkdir(parents=True, exist_ok=True)
    for manuscript in selected:
        if not manuscript.source_html.exists():
            raise FileNotFoundError(f"Missing source HTML for {manuscript.slug}: {manuscript.source_html}")
        body = manuscript.source_html.read_text(encoding="utf-8")
        if not manuscript.has_numbered_headings:
            body = normalize_volume1_headings(body)
        headings = extract_headings(body)
        toc_html = render_toc(nest_headings(headings))
        processed_body, manifest_entries = copy_assets(manuscript.slug, body)
        html_out = build_template(manuscript, processed_body, toc_html)
        output_path = DEST_HTML_DIR / f"{manuscript.slug}.html"
        output_path.write_text(html_out, encoding="utf-8")
        write_manifest(manuscript.slug, manifest_entries)
        # Count only entries that were really copied (they carry a sha256), so
        # the summary agrees with "asset_count" in the manifest instead of
        # also counting missing and external references.
        copied_count = sum(1 for entry in manifest_entries if entry.get("sha256"))
        print(f"✓ Built {output_path.relative_to(ROOT)} ({copied_count} assets)")


# Run the builder only when this file is executed as a script.
if __name__ == "__main__":
    main()
