#!/usr/bin/env python3
"""
AI Hallucination Guard
Checks for common AI-generated text issues like references to non-existent appendices
"""

import re
import os
import sys
from pathlib import Path

# Patterns that indicate potential hallucinations.
# Each entry is a (regex, pattern_type) pair. check_file() applies every regex
# with re.IGNORECASE, so the literal words below match in any letter case.
HALLUCINATION_PATTERNS = [
    # References to appendices that might not exist.
    # The word "Appendix" is mandatory; a lead-in such as "see" is optional and
    # only widens the matched text. Without that, "see Appendix B" would match
    # as "see A" (capturing the "A" of "Appendix") and "see the" would be
    # reported as Appendix "t". (?-i:...) keeps the label letter uppercase-only
    # even under re.IGNORECASE, and \b stops "Appendix Tables" yielding "T".
    (r'(?:(?:see|shown in|proven in|derived in)\s+)?Appendix\s+((?-i:[A-Z])\d?)\b', 'appendix_ref'),
    # References to sections/equations that might not exist.
    # The number is one or more dot-separated integers, so a sentence-ending
    # period ("Section 3.") is not captured and "3.2.1" is captured whole.
    # NOTE(review): equation numbers share the 'section_ref' type, so
    # check_file() compares them against section headings -- confirm intended.
    (r'(?:Section|Eq\.|Equation)\s+(\d+(?:\.\d+)*)', 'section_ref'),
    # Overly confident claims
    (r'(?:definitively proves|conclusively shows|unequivocally demonstrates)', 'overconfident'),
    # References to figures. check_file() has no branch for 'figure_ref', so
    # these matches are currently not reported.
    (r'(?:Figure|Fig\.)\s+(\d+)', 'figure_ref'),
    # Citations that look like placeholders, e.g. "[Smith et al.]" or "[Author 2020]"
    (r'\[(?:Smith et al\.|Jones \d{4}|Author \d{4})\]', 'fake_citation'),
]

def extract_existing_sections(content):
    """Return the set of section numbers found in '#'-style headings.

    A heading is a line that begins with one or more '#' characters followed
    by a number such as "2", "2." or "2.1" and then whitespace. The number is
    returned exactly as written, so a trailing dot is preserved.
    """
    heading_re = r'(?:^|\n)#+\s*(\d+\.?\d*)\s+'
    return {hit.group(1) for hit in re.finditer(heading_re, content, re.MULTILINE)}

def extract_existing_appendices(content):
    """Return the set of appendix labels declared in '#'-style headings.

    A heading such as "## Appendix A: Proofs" or "# appendix b2" declares the
    label "A" / "B2". The word "Appendix" is matched case-insensitively and
    the label is normalised to uppercase, so callers get one canonical
    spelling regardless of how the heading was typed.

    Args:
        content: Full text of the document.

    Returns:
        A set of uppercase labels (a letter optionally followed by one digit).
    """
    # \b after the label keeps ordinary words from being read as labels:
    # without it "# Appendix Tables" would declare an appendix "T".
    appendix_pattern = r'^#+\s*Appendix\s+([A-Z]\d?)\b'
    flags = re.MULTILINE | re.IGNORECASE
    return {
        match.group(1).upper()
        for match in re.finditer(appendix_pattern, content, flags)
    }

def check_file(filepath):
    """Check a single file for potential hallucinations.

    The file is read as UTF-8, the section and appendix headings it declares
    are collected, and every regex in HALLUCINATION_PATTERNS is then run over
    the text (case-insensitively).

    Args:
        filepath: Path of the file to scan.

    Returns:
        A list of issue dicts, each with the keys 'file', 'line' (1-based),
        'type', 'text' (the matched text) and 'detail'. If the file cannot be
        read, the list holds a single issue of type 'read_error' with line 0,
        so callers can treat every entry uniformly.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers
        # UnicodeDecodeError and malformed paths (e.g. embedded NUL).
        return [{
            'file': filepath,
            'line': 0,
            'type': 'read_error',
            'text': '',
            'detail': f"Error reading file: {e}",
        }]

    # Headings the document actually declares. Appendix labels are compared
    # in uppercase because matching is case-insensitive on both sides.
    existing_sections = extract_existing_sections(content)
    existing_appendices = {
        label.upper() for label in extract_existing_appendices(content)
    }
    # Top-level part of every declared section ("2.1" -> "2"), built once.
    top_level_sections = {s.split('.')[0] for s in existing_sections}

    issues = []
    for pattern, pattern_type in HALLUCINATION_PATTERNS:
        for match in re.finditer(pattern, content, re.IGNORECASE):
            # 1-based line number of the match start, counted without
            # copying the preceding text.
            line_num = content.count('\n', 0, match.start()) + 1

            if pattern_type == 'appendix_ref':
                appendix_label = match.group(1).upper()
                if appendix_label not in existing_appendices:
                    issues.append({
                        'file': filepath,
                        'line': line_num,
                        'type': 'missing_appendix',
                        'text': match.group(0),
                        'detail': f"Reference to Appendix {appendix_label} but it doesn't exist"
                    })

            elif pattern_type == 'section_ref':
                section_num = match.group(1)
                # Only flag if we found some sections but not this one;
                # a document with no numbered headings is not judged.
                if existing_sections and section_num not in existing_sections:
                    # Allow some flexibility for subsections: a reference is
                    # accepted when its top-level number has any heading.
                    if section_num.split('.')[0] not in top_level_sections:
                        issues.append({
                            'file': filepath,
                            'line': line_num,
                            'type': 'missing_section',
                            'text': match.group(0),
                            'detail': f"Reference to Section {section_num} but it doesn't exist"
                        })

            elif pattern_type == 'overconfident':
                issues.append({
                    'file': filepath,
                    'line': line_num,
                    'type': 'overconfident_claim',
                    'text': match.group(0),
                    'detail': "Overly confident language detected"
                })

            elif pattern_type == 'fake_citation':
                issues.append({
                    'file': filepath,
                    'line': line_num,
                    'type': 'suspicious_citation',
                    'text': match.group(0),
                    'detail': "Citation looks like AI placeholder"
                })
            # Any other pattern type (currently 'figure_ref') has no branch
            # and therefore produces no issue.

    return issues

def main():
    """Check all paper files for hallucinations and print a report.

    Walks three hard-coded directories (relative paths, so they are resolved
    against the current working directory), runs check_file() on every
    .html/.md/.txt file found, and prints a summary grouped by issue type
    with at most five examples per type.

    Returns:
        1 if any issue was found, otherwise 0 (suitable for sys.exit()).
    """
    # Find all relevant files
    paper_dirs = [
        'app/templates/papers/',
        'app/static/papers/',
        'docs/papers/'
    ]
    paper_suffixes = ('.html', '.md', '.txt')

    all_issues = []
    files_checked = 0

    for paper_dir in paper_dirs:
        if not os.path.isdir(paper_dir):
            continue
        for root, dirs, files in os.walk(paper_dir):
            # Sorting in place fixes the traversal order, so the report is
            # reproducible from run to run.
            dirs.sort()
            for name in sorted(files):
                if not name.endswith(paper_suffixes):
                    continue
                filepath = os.path.join(root, name)
                files_checked += 1
                for issue in check_file(filepath):
                    if not isinstance(issue, dict):
                        # A plain message (e.g. a read failure reported as a
                        # string) is wrapped so the report code below can
                        # index every entry the same way.
                        issue = {
                            'file': filepath,
                            'line': 0,
                            'type': 'read_error',
                            'text': '',
                            'detail': str(issue),
                        }
                    all_issues.append(issue)

    # Report results
    print("AI Hallucination Guard Report")
    print("============================")
    print(f"Files checked: {files_checked}")
    print(f"Issues found: {len(all_issues)}")
    print()

    if not all_issues:
        print("✓ No potential hallucinations detected!")
        return 0

    # Group by type, keeping first-seen order of the types
    by_type = {}
    for issue in all_issues:
        by_type.setdefault(issue['type'], []).append(issue)

    # Print issues by type
    for issue_type, issues in by_type.items():
        print(f"\n{issue_type.replace('_', ' ').title()} ({len(issues)} found):")
        print("-" * 60)
        for issue in issues[:5]:  # Show first 5 of each type
            print(f"  {issue['file']}:{issue['line']}")
            print(f"    Text: '{issue['text']}'")
            print(f"    Issue: {issue['detail']}")
            print()

        if len(issues) > 5:
            print(f"  ... and {len(issues) - 5} more")

    return 1  # Exit with error code

# Run the checker only when executed as a script; main() returns 0 or 1 and
# that value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())