#!/usr/bin/env python3
"""
Citation linter for RFT website
Ensures all decisive claims have proper citations
"""

import re
import sys
import os
from pathlib import Path
from typing import List, Tuple, Dict
import html
from bs4 import BeautifulSoup

# Trigger phrases that mark a piece of text as a decisive claim needing a
# citation. Each entry is a regular expression that CitationChecker.check_file
# searches for (case-insensitively) in an element's visible text; entries are
# tried in list order and the first one that matches is the pattern named in
# the report.
TRIGGER_PATTERNS = [
    r"\bproves?\b",                     # "prove" / "proves"
    r"\bshows?\s+that\b",               # "show that" / "shows that"
    r"\bexact(ly)?\b",                  # "exact" / "exactly"
    r"\bdemonstrates?\b",               # "demonstrate" / "demonstrates"
    r"\bestablishes?\b",                # "establishe" / "establishes" (the optional "s" only)
    r"\bpredicts?\b",                   # "predict" / "predicts"
    r"\bpredicted\b",                   # past tense of the entry above
    r"\bshow(s|ed)\s+unambiguously\b",  # "shows unambiguously" / "showed unambiguously"
    r"\bgives?\s+the\s+exact\b"         # "give(s) the exact"; any such text also matches the "exact" entry above
]

# Markup shapes that count as a citation. CitationChecker.check_file searches
# the serialized HTML of an element (not its plain text) for each of these;
# one match anywhere inside the element is enough to treat the claim as cited.
CITATION_PATTERNS = [
    r'<sup>\[[\d,\s]+\]</sup>',  # Superscript citations, e.g. <sup>[1, 2]</sup> (no attributes on <sup>)
    r'\[<a[^>]*>[^<]+</a>\]',      # Link citations, e.g. [<a href="#ref1">1</a>]
    r'<cite[^>]*>.*?</cite>',      # Cite tags
    r'\(\w+\s+\d{4}\)',            # Author Year format, e.g. (Smith 2020); "(Smith, 2020)" and "(Smith et al. 2020)" do not match
    r'<span[^>]*class="[^"]*citation[^"]*"[^>]*>.*?</span>'  # Citation spans: any <span> whose class attribute contains "citation"
]

# Files to exclude from checking.
# CitationChecker.should_check_file tests each pattern with Path.match, which
# compares path components starting from the right-hand end of the path and
# never lets "*" span a path separator. A pattern such as '*/auth/*' therefore
# matches files that sit directly inside a directory named "auth"; files in
# deeper subdirectories of it are not excluded by that pattern.
EXCLUDE_PATTERNS = [
    '*/admin_*',        # any file whose own name starts with "admin_"
    '*/auth/*',         # files directly inside an "auth" directory
    '*/error/*',        # files directly inside an "error" directory
    '*/_includes/*',    # files directly inside an "_includes" directory
    '*/_macros/*',      # files directly inside a "_macros" directory
    '*/base.html',      # any file named base.html
    '*/changelog.html'  # any file named changelog.html
]

class CitationChecker:
    """Find decisive claims in HTML files that carry no citation.

    A claim is a ``<p>``, ``<li>`` or ``<div>`` whose own text matches one of
    ``TRIGGER_PATTERNS``. It counts as cited when the element's markup matches
    one of ``CITATION_PATTERNS``. Findings accumulate in ``self.errors`` and
    are rendered by :meth:`print_report`.
    """

    # Elements whose text is scanned for trigger phrases.
    _BLOCK_TAGS = ['p', 'li', 'div']
    # Elements dropped from the tree before scanning (not prose).
    _SKIPPED_TAGS = ['script', 'style', 'code', 'pre']

    def __init__(self, base_dir: str):
        """Create a checker.

        Args:
            base_dir: Directory that file paths in the report are shown
                relative to.
        """
        self.base_dir = Path(base_dir)
        # One dict per uncited claim: file, line, pattern, context, severity.
        self.errors: List[Dict] = []
        # Read by print_report (file, line, context); nothing in this class
        # adds entries to it.
        self.warnings: List[Dict] = []

    def check_file(self, filepath: Path) -> None:
        """Check a single HTML file for uncited claims.

        Every uncited claim found is appended to ``self.errors``. A file that
        cannot be read or decoded as UTF-8 is reported on stdout and skipped.

        Args:
            filepath: Path of the HTML file to analyse.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading {filepath}: {e}")
            return

        soup = BeautifulSoup(content, 'html.parser')

        # Scripts, styles and code samples are not prose claims.
        for tag in soup(self._SKIPPED_TAGS):
            tag.decompose()

        for element in soup.find_all(self._BLOCK_TAGS):
            # Only the text this element owns directly is inspected, so a
            # sentence inside nested blocks (<div><li><p>...) is reported once
            # for the innermost block instead of once per ancestor.
            text = self._own_text(element)
            if not text:
                continue

            hit = self._first_trigger(text)
            if hit is None:
                continue
            pattern, match = hit

            # A citation anywhere in the element's markup covers the claim.
            # DOTALL lets the ".*?" patterns match <cite>/<span> content that
            # spans several lines.
            markup = str(element)
            if any(re.search(cit, markup, re.DOTALL) for cit in CITATION_PATTERNS):
                continue

            self.errors.append({
                'file': self._display_path(filepath),
                'line': self._line_number(content, element),
                'pattern': pattern,
                'context': self._extract_context(text, match),
                'severity': 'error'
            })

    def _own_text(self, element) -> str:
        """Return the whitespace-normalised text owned directly by *element*.

        Text inside a nested ``p``/``li``/``div`` belongs to that nested
        element and is replaced by a single space here. Text nodes are joined
        exactly as they appear in the document, so inline markup such as
        ``This <b>proves</b> that`` keeps its word boundaries.
        """
        pieces = []
        for string in element.strings:
            if string.find_parent(self._BLOCK_TAGS) is element:
                pieces.append(string)
            else:
                # Keep a gap where a nested block was skipped so the text on
                # either side of it does not fuse into one word.
                pieces.append(' ')
        return ' '.join(''.join(pieces).split())

    @staticmethod
    def _first_trigger(text: str):
        """Return ``(pattern, match)`` for the first trigger found in *text*, else ``None``."""
        for pattern in TRIGGER_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return pattern, match
        return None

    @staticmethod
    def _extract_context(text: str, match, margin: int = 40) -> str:
        """Return the matched phrase with up to *margin* characters either side.

        An ellipsis is added on each side where the text was cut.
        """
        start = max(0, match.start() - margin)
        end = min(len(text), match.end() + margin)
        context = text[start:end]
        if start > 0:
            context = "..." + context
        if end < len(text):
            context = context + "..."
        return context

    @staticmethod
    def _line_number(content: str, element) -> int:
        """Return the 1-based source line of *element*, or 0 when unknown.

        The line recorded by the parser is preferred. The fallback searches for
        the element's re-serialised markup in the raw file, which can fail
        because serialisation does not always reproduce the source byte for
        byte; in that case 0 is returned rather than a misleading line.
        """
        line = getattr(element, 'sourceline', None)
        if line is not None:
            return line
        offset = content.find(str(element))
        if offset < 0:
            return 0
        return content.count('\n', 0, offset) + 1

    def _display_path(self, filepath: Path) -> str:
        """Return *filepath* relative to ``base_dir``, or unchanged if it lies outside it."""
        try:
            return str(filepath.relative_to(self.base_dir))
        except ValueError:
            return str(filepath)

    def should_check_file(self, filepath: Path) -> bool:
        """Return True when *filepath* is an ``.html`` file not matched by ``EXCLUDE_PATTERNS``."""
        if filepath.suffix != '.html':
            return False
        # Path.match anchors on the right-hand end of the path, so a pattern
        # like '*/auth/*' only matches files directly inside an "auth" directory.
        return not any(filepath.match(pattern) for pattern in EXCLUDE_PATTERNS)

    def check_directory(self, directory: Path) -> None:
        """Recursively check every eligible HTML file below *directory*.

        Files are visited in sorted order so the report is deterministic.
        """
        for filepath in sorted(directory.rglob('*.html')):
            if self.should_check_file(filepath):
                self.check_file(filepath)

    def print_report(self, max_errors: int = 2) -> int:
        """Print the findings and return a process exit code.

        Args:
            max_errors: Number of uncited claims tolerated before the run
                counts as failed.

        Returns:
            1 when more than *max_errors* uncited claims were recorded,
            otherwise 0.
        """
        if not self.errors and not self.warnings:
            print("✅ All decisive claims have citations!")
            return 0

        print("\n🔍 Citation Check Report")
        print(f"{'='*60}")

        if self.warnings:
            print(f"\n⚠️  Warnings ({len(self.warnings)}):")
            for w in self.warnings:
                print(f"  {w['file']}:{w['line']} - {w['context']}")

        if self.errors:
            print(f"\n❌ Errors ({len(self.errors)}):")
            for e in self.errors:
                print(f"  {e['file']}:{e['line']}")
                print(f"    Pattern: {e['pattern']}")
                print(f"    Context: {e['context']}")
                print()

        print(f"\nSummary: {len(self.errors)} uncited claims found")

        # A small number of uncited claims is tolerated so CI does not fail on
        # the first one.
        return 1 if len(self.errors) > max_errors else 0

def main():
    """Lint the templates under ``../app/templates`` and exit with the report's status.

    The template directory is located relative to this script. If it does not
    exist, an error is printed and the process exits with status 1.
    """
    templates_root = Path(__file__).parent.parent / 'app' / 'templates'

    if not templates_root.exists():
        print(f"Error: Template directory not found at {templates_root}")
        sys.exit(1)

    # Reported paths are shown relative to the directory that holds the
    # templates folder, while only the templates folder itself is scanned.
    linter = CitationChecker(templates_root.parent)
    linter.check_directory(templates_root)

    sys.exit(linter.print_report())

# Run the linter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()