#!/usr/bin/env python3
"""
RFT Cosmology Sitemap Generator
==============================

Automatically generates sitemap.xml for SEO optimization.
Includes static pages, papers, simulators, recent approved forum posts, and selected data files.
"""

import os
import sys
import json
import sqlite3
from datetime import datetime, timedelta
from urllib.parse import urljoin
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom

# Make the directory one level above this script's own directory importable
# (presumably the project root that contains the `app` package), so the
# import below works when this file is run as a standalone script.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
    from app.papers_data import FEATURED_PAPERS, ARCHIVE_PAPERS
except ImportError:
    # Paper metadata is unavailable: fall back to empty lists so the rest of
    # the sitemap can still be generated (only the per-paper URLs are lost).
    FEATURED_PAPERS = []
    ARCHIVE_PAPERS = []

# Configuration
# BASE_URL: absolute origin that every sitemap path is joined onto.
BASE_URL = "https://rft-cosmology.com"
# SITE_ROOT: filesystem root of the site; used to locate the forum database,
# the static data files and the directory the sitemap is written to.
SITE_ROOT = "/home/rftuser/rft-cosmology-site"

class SitemapGenerator:
    def __init__(self):
        self.base_url = BASE_URL
        self.site_root = SITE_ROOT
        self.now = datetime.now()
        
    def create_url_element(self, parent, loc, lastmod=None, changefreq=None, priority=None):
        """Create a URL element for the sitemap."""
        url = SubElement(parent, 'url')
        
        # Location (required)
        SubElement(url, 'loc').text = urljoin(self.base_url, loc)
        
        # Last modification date
        if lastmod:
            if isinstance(lastmod, datetime):
                lastmod = lastmod.isoformat()
            elif isinstance(lastmod, str):
                # Ensure proper ISO format
                try:
                    dt = datetime.fromisoformat(lastmod.replace('Z', '+00:00'))
                    lastmod = dt.isoformat()
                except:
                    lastmod = self.now.isoformat()
            SubElement(url, 'lastmod').text = lastmod
        
        # Change frequency
        if changefreq:
            SubElement(url, 'changefreq').text = changefreq
            
        # Priority
        if priority:
            SubElement(url, 'priority').text = str(priority)
            
        return url

    def get_static_pages(self):
        """Return the hand-maintained list of static site pages.

        Each entry is a dict with ``url``, ``priority`` and ``changefreq``
        keys, in the order the pages are declared below.
        """
        # (path, priority, changefreq) rows, grouped by site section.
        rows = (
            # Core pages (highest priority)
            ('/', 1.0, 'weekly'),
            ('/theory', 1.0, 'monthly'),
            ('/evidence', 1.0, 'monthly'),
            ('/papers', 1.0, 'weekly'),

            # Educational content (high priority)
            ('/introduction', 0.9, 'monthly'),
            ('/quick-start', 0.9, 'monthly'),
            ('/advanced-quick-start', 0.9, 'monthly'),
            ('/math-reference', 0.9, 'monthly'),
            ('/testable-predictions', 0.9, 'weekly'),
            ('/glossary', 0.8, 'monthly'),
            ('/quantum', 1.0, 'weekly'),  # New quantum section

            # Interactive content
            ('/simulators', 0.9, 'weekly'),
            ('/cosmology/cmb-explorer', 0.8, 'monthly'),
            ('/predictions-dashboard', 0.8, 'weekly'),
            ('/equation-explorer', 0.8, 'monthly'),
            ('/data-watch', 0.8, 'daily'),

            # Tools and calculators
            ('/calculators/neutrino-mass', 0.7, 'monthly'),
            ('/calculators/black-hole-echoes', 0.7, 'monthly'),
            ('/calculators/dark-matter-profiles', 0.7, 'monthly'),

            # Interactive demos
            ('/twistor-bundle-demo', 0.7, 'monthly'),
            ('/scalaron-screening', 0.7, 'monthly'),
            ('/structure-timeline', 0.7, 'monthly'),

            # Specific content pages
            ('/vacuum_energy', 0.8, 'monthly'),
            ('/sm_derivations', 0.8, 'monthly'),
            ('/mathematical-methods', 0.8, 'monthly'),

            # Community and support
            ('/forum', 0.8, 'daily'),
            ('/blog', 0.7, 'weekly'),
            ('/about', 0.6, 'yearly'),
            ('/faqs', 0.6, 'monthly'),
        )

        return [
            {'url': path, 'priority': weight, 'changefreq': freq}
            for path, weight, freq in rows
        ]

    def get_paper_pages(self):
        """Build sitemap entries for the featured and archived papers.

        Reads the module-level ``FEATURED_PAPERS`` and ``ARCHIVE_PAPERS``
        lists. A paper without a ``slug`` is skipped entirely; a paper that
        also carries a ``paper_id`` gets a second ``/papers/rft/<id>`` entry.
        ``lastmod`` is the paper's ``date`` value, or the generation time when
        the key is absent. Two fixed special pages are appended at the end.
        """
        generated_at = self.now.isoformat()
        entries = []

        # Featured papers are listed first and rank above archived ones;
        # apart from the priority both collections are handled the same way.
        for collection, weight in ((FEATURED_PAPERS, 0.9), (ARCHIVE_PAPERS, 0.8)):
            for paper in collection:
                if 'slug' not in paper:
                    continue

                modified = paper.get('date', generated_at)
                entries.append({
                    'url': f"/papers/{paper['slug']}",
                    'priority': weight,
                    'changefreq': 'monthly',
                    'lastmod': modified,
                })

                # Toggle version of the same paper, when it has an id.
                if 'paper_id' in paper:
                    entries.append({
                        'url': f"/papers/rft/{paper['paper_id']}",
                        'priority': weight,
                        'changefreq': 'monthly',
                        'lastmod': modified,
                    })

        # Special paper pages that are not part of either list.
        entries.append({'url': '/papers/arrow-of-time', 'priority': 0.8, 'changefreq': 'monthly'})
        entries.append({'url': '/papers/15x', 'priority': 0.9, 'changefreq': 'weekly'})  # 15.x series page

        return entries

    def get_simulator_pages(self):
        """Return the fixed list of simulator / interactive tool pages.

        Each entry is a dict with ``url``, ``priority`` and ``changefreq``
        keys; every simulator page is declared as changing monthly.
        """
        # (path, priority) rows in listing order.
        rows = (
            # Main simulators
            ('/simulator/core-cusp', 0.8),
            ('/simulator/rotation-curves', 0.8),
            ('/simulator/frg-flow', 0.8),
            ('/simulator/scalaron-playground', 0.8),
            ('/simulator/jeans-playground', 0.7),
            ('/simulator/rft-gravity', 0.8),
            ('/simulator/gravity-comparison', 0.8),
            ('/simulators/rft-gravity-enhanced', 0.8),

            # Background simulators
            ('/simulator/background', 0.7),
            ('/simulator/frg', 0.7),
            ('/simulator/perturbations', 0.7),
            ('/simulator/gauge', 0.7),
        )

        return [
            {'url': path, 'priority': weight, 'changefreq': 'monthly'}
            for path, weight in rows
        ]

    def get_forum_pages(self):
        """Get approved forum posts from the database."""
        forum_pages = []
        
        try:
            db_path = os.path.join(self.site_root, 'instance', 'forum.db')
            if os.path.exists(db_path):
                conn = sqlite3.connect(db_path)
                cursor = conn.cursor()
                
                # Get approved main posts (not replies) from last 6 months
                six_months_ago = (self.now - timedelta(days=180)).strftime('%Y-%m-%d')
                
                cursor.execute('''
                    SELECT id, timestamp, last_activity 
                    FROM posts 
                    WHERE approved = 1 
                    AND parent_id IS NULL 
                    AND timestamp > ?
                    AND upvotes - downvotes >= 0
                    ORDER BY timestamp DESC
                    LIMIT 100
                ''', (six_months_ago,))
                
                posts = cursor.fetchall()
                
                for post_id, timestamp, last_activity in posts:
                    forum_pages.append({
                        'url': f'/forum/post/{post_id}',
                        'priority': 0.6,
                        'changefreq': 'weekly',
                        'lastmod': last_activity or timestamp
                    })
                
                conn.close()
                
        except Exception as e:
            print(f"Warning: Could not access forum database: {e}")
        
        return forum_pages

    def get_static_files(self):
        """Get important static files that should be indexed."""
        static_files = []
        
        # Important data files for academic indexing
        data_files = [
            '/static/data/rft_predictions.json',
            '/static/data/pdg_masses_2025.csv',
            '/static/data/twistor_Yukawa_params.json',
        ]
        
        for file_path in data_files:
            full_path = os.path.join(self.site_root, file_path.lstrip('/'))
            if os.path.exists(full_path):
                mtime = datetime.fromtimestamp(os.path.getmtime(full_path))
                static_files.append({
                    'url': file_path,
                    'priority': 0.5,
                    'changefreq': 'monthly',
                    'lastmod': mtime.isoformat()
                })
        
        return static_files

    def generate_sitemap(self):
        """Build the complete sitemap document.

        Sections are collected and added in a fixed order: static pages,
        papers, simulators, forum posts, static data files. Static and
        simulator pages are stamped with the generation time; the other
        sections use each entry's own ``lastmod`` when it has one.

        Returns:
            tuple: ``(xml_text, url_count)`` — the pretty-printed XML with
            blank lines removed, and the number of ``<url>`` entries added.
        """
        print("Generating sitemap for RFT Cosmology...")

        # Root element with the sitemap namespace and schema declarations.
        root = Element('urlset')
        for attr_name, attr_value in (
            ('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9'),
            ('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
            ('xsi:schemaLocation', 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'),
        ):
            root.set(attr_name, attr_value)

        generated_at = self.now.isoformat()

        # (progress message, collector, stamp entries with the generation time?)
        sections = (
            ("Adding static pages...", self.get_static_pages, True),
            ("Adding paper pages...", self.get_paper_pages, False),
            ("Adding simulator pages...", self.get_simulator_pages, True),
            ("Adding forum pages...", self.get_forum_pages, False),
            ("Adding important static files...", self.get_static_files, False),
        )

        total = 0
        for banner, collect, stamp_with_now in sections:
            print(banner)
            for entry in collect():
                if stamp_with_now:
                    modified = generated_at
                else:
                    modified = entry.get('lastmod')
                self.create_url_element(
                    root,
                    entry['url'],
                    lastmod=modified,
                    changefreq=entry['changefreq'],
                    priority=entry['priority'],
                )
                total += 1

        # Round-trip through minidom to get indented output.
        serialized = tostring(root, 'utf-8')
        document = minidom.parseString(serialized)
        indented = document.toprettyxml(indent='  ')

        # Drop whitespace-only lines.
        kept_lines = (line for line in indented.split('\n') if line.strip())
        return '\n'.join(kept_lines), total

    def save_sitemap(self, xml_content):
        """Save the sitemap to the static directory."""
        static_dir = os.path.join(self.site_root, 'app', 'static')
        os.makedirs(static_dir, exist_ok=True)
        
        sitemap_path = os.path.join(static_dir, 'sitemap.xml')
        
        with open(sitemap_path, 'w', encoding='utf-8') as f:
            f.write(xml_content)
        
        return sitemap_path

def main():
    """Generate the sitemap, save it plus a timestamped backup, and report.

    Progress and the resulting locations are printed to stdout. Any exception
    raised while generating or writing is reported with a traceback instead of
    propagating.

    Returns:
        bool: ``True`` on success, ``False`` if generation or saving failed.
    """
    generator = SitemapGenerator()

    try:
        xml_content, url_count = generator.generate_sitemap()
        sitemap_path = generator.save_sitemap(xml_content)

        print("✅ Sitemap generated successfully!")
        print(f"📍 Location: {sitemap_path}")
        print(f"📊 URLs included: {url_count}")
        print(f"🌐 Accessible at: {generator.base_url}/static/sitemap.xml")

        # Also save a backup with timestamp. Split off the extension rather
        # than using str.replace, which would also rewrite any other '.xml'
        # occurring earlier in the path (e.g. in a directory name).
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        stem, extension = os.path.splitext(sitemap_path)
        backup_path = f"{stem}_{timestamp}{extension}"
        with open(backup_path, 'w', encoding='utf-8') as f:
            f.write(xml_content)
        print(f"📋 Backup saved: {backup_path}")

        return True

    except Exception as e:
        print(f"❌ Error generating sitemap: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    # Exit status mirrors main(): 0 on success, 1 on failure.
    sys.exit(0 if main() else 1)