#!/usr/bin/env python3
"""
Remove AI-generated verbose sections from RFT Volume 2
Keeps: equations, technical content, results
Removes: lay summaries, why it matters, how to falsify, glossary addendums
"""

import re

# Leading YAML front matter: a `---` line at the very start of the document
# through the next line consisting solely of `---`. The closing newline is
# deliberately left out of the match so it stays with the body.
_FRONT_MATTER_RE = re.compile(
    r'\A---[ \t]*\n(?:.*?\n)?---[ \t]*(?=\n|\Z)',
    re.DOTALL,
)

# Ordered (pattern, replacement, flags) substitutions. Order matters: later
# rules run on the output of earlier ones (e.g. the "Equation of record"
# header is used as a stop marker by the lay-summary rules before it is
# itself removed).
_SLOP_RULES = (
    # Template comment
    (r'<!-- SECTION TEMPLATE:.*?-->\n\n', '', re.DOTALL),

    # "Lay summary" sections, bold-header form, up to the next known marker
    (
        r'\*\*Lay summary[^\n]*\n.*?\n\n(?=\*\*(?:Why it matters|How to falsify|Equation of record)|##|\$\$)',
        '',
        re.DOTALL,
    ),
    # "Lay summary (...)." plain-text form
    (
        r'Lay summary \([^\)]+\)\.\s+.*?\n\n(?=\*\*(?:Why it matters|How to falsify|Equation of record)|##)',
        '',
        re.DOTALL,
    ),

    # "Why it matters" bullet sections
    (
        r'\*\*Why it matters\.?\*\*\s*\n[-•]\s+.*?(?=\n\n\*\*(?:How to falsify|Equation of record|Glossary)|##)',
        '',
        re.DOTALL,
    ),

    # "How to falsify" bullet sections
    (
        r'\*\*How to falsify\.?\*\*\s*\n[-•]\s+.*?(?=\n\n\*\*(?:Equation of record|Glossary)|##)',
        '',
        re.DOTALL,
    ),

    # "Glossary addendum" single lines
    (r'\*\*Glossary addendum\.\*\*[^\n]*\n', '', 0),

    # Numbered "Glossary (5 key terms)" sections
    (
        r'\*\*Glossary \(5 key terms\)\.\*\*\s*\n(?:\d+\.\s+\*\*[^\*]+\*\*[^\n]*\n)+',
        '',
        0,
    ),

    # "Equation of record" headers (the equations that follow are kept)
    (r'\*\*Equation of record\.\*\*\s*\n', '', 0),

    # Figure / notebook / uncertainty footer callout lines
    (r'\*\*Figure\.\*\*[^\n]*\n', '', 0),
    (r'\*\*Notebook bundle\.\*\*[^\n]*\n', '', 0),
    (r'\*\*Uncertainty footer\.\*\*[^\n]*\n', '', 0),

    # "Reproduce:" and "Provenance:" lines
    (r'Reproduce:[^\n]*\n', '', 0),
    (r'Provenance:[^\n]*\n', '', 0),

    # "Falsifier." paragraphs (header line through the next blank line)
    (r'\*\*Falsifier\.\*\*[^\n]*\n(?:[^\n]+\n)*?\n', '', 0),

    # Collapse runs of blank lines left behind by the removals above
    (r'\n{3,}', '\n\n', 0),

    # Standalone horizontal rules that served as section separators
    (r'\n---\n\n', '\n\n', 0),

    # "Why this matters:" bullets (appendix sections)
    (r'-\s+\*\*Why this matters:\*\*[^\n]*\n', '', 0),

    # "What this section covers" paragraphs, up to the next known marker
    (
        r'What this section covers\s*\n+.*?\n\n(?=-\s+\*\*Why|Proof sketch|### |## |# )',
        '',
        re.DOTALL,
    ),

    # "What to check:" lines
    (r'\*What to check:\*[^\n]*\n', '', 0),

    # "Proof sketch / derivation outline" numbered lists with their
    # trailing "Run the corresponding ..." line
    (
        r'Proof sketch / derivation outline\s*\n+(?:\d+\.\s+.*?\n)+\nRun the corresponding[^\n]*\n',
        '',
        re.DOTALL,
    ),

    # Leftover emoji and bold-prefixed shortcode references
    (r'📦\s*', '', 0),
    (r'\*\*\{\{<.*?>\}\}', '', 0),

    # Final collapse of blank-line runs
    (r'\n{3,}', '\n\n', 0),
)


def remove_ai_slop(content):
    """Remove AI-generated boilerplate sections while preserving physics content.

    A leading YAML front-matter block (a ``---`` line at the very start of
    the text through the next ``---`` line) is set aside and re-attached
    untouched. Without this, the horizontal-rule rule would strip the
    closing ``---`` delimiter whenever it is followed by a blank line,
    corrupting the document metadata.

    The remaining body is passed through ``_SLOP_RULES`` in order; each
    rule is a regular-expression substitution applied to the output of the
    previous one.

    Args:
        content: Full text of the Markdown/Quarto document.

    Returns:
        The cleaned text: the original front matter (if any) followed by
        the body with the matched sections removed and blank-line runs
        collapsed to a single blank line.
    """
    front_matter = ''
    match = _FRONT_MATTER_RE.match(content)
    if match:
        # Protect the metadata block; only the body is cleaned.
        front_matter = match.group(0)
        content = content[match.end():]

    for pattern, replacement, flags in _SLOP_RULES:
        content = re.sub(pattern, replacement, content, flags=flags)

    return front_matter + content

def main(input_file='/home/rftuser/rft-vol2-arxiv/index.qmd'):
    """Clean a document in place and report how much text was removed.

    Reads ``input_file``, runs it through ``remove_ai_slop``, overwrites
    the same file with the result, and prints the path and the number of
    characters removed.

    The file is read and written explicitly as UTF-8: the cleaner handles
    non-ASCII text (it strips an emoji), so relying on the locale's default
    encoding could raise or corrupt the document on non-UTF-8 systems.

    Args:
        input_file: Path of the document to clean. Defaults to the RFT
            Volume 2 ``index.qmd`` location this script was written for.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    cleaned = remove_ai_slop(content)

    with open(input_file, 'w', encoding='utf-8') as f:
        f.write(cleaned)

    print(f"Cleaned {input_file}")
    print(f"Removed {len(content) - len(cleaned)} characters of AI slop")

# Script entry point: run the cleanup only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()