#!/usr/bin/env python3
"""
RFT Cosmology SEO Management Suite
==================================

Comprehensive SEO management tools including:
- Automatic sitemap generation and updates
- SEO health monitoring
- Meta tag optimization checks
- Performance monitoring
- Search console integration helpers
"""

import os
import sys
import json
import requests
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from urllib.parse import urljoin, urlparse
import subprocess
import argparse

# Append the parent of this script's directory (two dirname() hops above the
# absolute path of this file) to the END of sys.path, so modules located there
# can be imported without shadowing anything already importable.
# NOTE(review): nothing in this file visibly imports from that directory -
# confirm the path tweak is still needed.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

class SEOManager:
    """SEO checks and report generation for one checkout of the site.

    Paths are derived from ``site_root``:

    * ``<site_root>/app/static``  - where ``sitemap.xml`` and ``robots.txt`` are read
    * ``<site_root>/seo_reports`` - where reports and URL lists are written
    * ``<site_root>/scripts/generate_sitemap.py`` - the external sitemap generator
    * ``<site_root>/instance/forum.db`` - the forum SQLite database

    The public check methods report failure through their return value
    (``(ok, message)`` tuples or dicts) instead of raising, so a full report
    can always be assembled.
    """

    # Tag prefix for elements in the sitemaps.org 0.9 namespace, written in the
    # "{namespace-uri}" form ElementTree uses for qualified tag names.
    _SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

    def __init__(self, site_root="/home/rftuser/rft-cosmology-site", base_url="https://rft-cosmology.com"):
        """Store the site locations and make sure the reports directory exists.

        Args:
            site_root: Filesystem root of the site checkout.
            base_url: Public base URL that sitemap and robots.txt entries are
                expected to live under.

        Raises:
            OSError: If the reports directory cannot be created (for example
                because ``site_root`` itself does not exist).
        """
        self.site_root = Path(site_root)
        self.base_url = base_url
        self.static_dir = self.site_root / "app" / "static"
        self.reports_dir = self.site_root / "seo_reports"
        self.reports_dir.mkdir(exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _is_site_url(self, url):
        """Return True if *url* is on the same scheme/host (and path prefix) as ``base_url``.

        A plain ``startswith(base_url)`` would also accept look-alike hosts
        such as ``https://rft-cosmology.com.evil.org``; comparing the parsed
        components avoids that.
        """
        try:
            base = urlparse(self.base_url)
            target = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed authorities (e.g. bad IPv6 literals).
            return False

        same_origin = (
            target.scheme.lower() == base.scheme.lower()
            and target.netloc.lower() == base.netloc.lower()
        )
        if not same_origin:
            return False

        base_path = base.path.rstrip('/')
        if not base_path:
            return True
        return target.path == base_path or target.path.startswith(base_path + '/')

    @classmethod
    def _extract_sitemap_locs(cls, root):
        """Return the stripped ``<loc>`` text of every ``<url>`` element under *root*.

        An entry whose ``<loc>`` is missing or has no text yields ``""`` so
        callers can decide whether to report or skip it.
        """
        locs = []
        for url_elem in root.findall(f".//{cls._SITEMAP_NS}url"):
            text = url_elem.findtext(f"{cls._SITEMAP_NS}loc", default="")
            locs.append((text or "").strip())
        return locs

    @staticmethod
    def _parse_robots_directives(content):
        """Split robots.txt text into ``(lowercased_field, value)`` pairs.

        ``#`` comments are removed and lines without a ``:`` separator are
        ignored. Field names are lowercased because robots.txt field names are
        matched case-insensitively.
        """
        directives = []
        for raw_line in content.splitlines():
            line = raw_line.split('#', 1)[0].strip()
            field, separator, value = line.partition(':')
            if not separator:
                continue
            directives.append((field.strip().lower(), value.strip()))
        return directives

    @staticmethod
    def _find_page_issues(content, page):
        """Return the list of SEO problems detected in a page's HTML text.

        These are plain substring checks on the markup, not an HTML parse.
        The structured-data check applies to the homepage (``page == '/'``) only.
        """
        required_markers = (
            ('<title>', "Missing title tag"),
            ('meta name="description"', "Missing meta description"),
            ('meta property="og:title"', "Missing Open Graph title"),
            ('canonical', "Missing canonical URL"),
        )
        issues = [message for marker, message in required_markers if marker not in content]

        if 'application/ld+json' not in content and page == '/':
            issues.append("Missing structured data on homepage")
        return issues

    @staticmethod
    def _build_recommendations(report):
        """Derive the human-readable recommendation list from a filled-in report dict."""
        recommendations = []

        if not report['sitemap']['generated']:
            recommendations.append("Fix sitemap generation issues")
        if not report['sitemap']['valid']:
            recommendations.append("Resolve sitemap validation errors")
        if not report['robots']['valid']:
            recommendations.append("Fix robots.txt configuration")
        if report['site_health']['health_score'] < 90:
            recommendations.append("Improve site health - fix page errors and missing meta tags")
        if report['site_health']['avg_response_time'] > 2.0:
            recommendations.append("Optimize page load times")

        forum = report['forum_seo']
        # When the forum analysis itself failed there is no activity figure, so
        # "too little activity" would be a misleading conclusion.
        if 'error' not in forum and forum.get('recent_posts', 0) < 5:
            recommendations.append("Increase forum activity for better content freshness")

        return recommendations

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def generate_sitemap(self):
        """Run ``scripts/generate_sitemap.py`` with the current interpreter.

        The script is executed with ``site_root`` as its working directory.

        Returns:
            ``(True, stdout)`` when the script exits with status 0, otherwise
            ``(False, stderr)``; ``(False, str(exc))`` if it could not be run.
        """
        print("🗺️  Generating fresh sitemap...")
        try:
            script_path = self.site_root / "scripts" / "generate_sitemap.py"
            result = subprocess.run([
                sys.executable, str(script_path)
            ], capture_output=True, text=True, cwd=str(self.site_root))

            if result.returncode == 0:
                print("✅ Sitemap generated successfully")
                return True, result.stdout
            else:
                print(f"❌ Sitemap generation failed: {result.stderr}")
                return False, result.stderr
        except Exception as e:
            # Boundary: callers expect a (success, message) tuple, never an exception.
            print(f"❌ Exception during sitemap generation: {e}")
            return False, str(e)

    def validate_sitemap(self):
        """Validate ``app/static/sitemap.xml`` for common issues.

        Checks the root namespace, that every ``<url>`` has a non-empty
        ``<loc>``, that each URL belongs to ``base_url``, that URLs carry no
        fragment or ``utm_`` tracking parameters, and that no URL is repeated.

        Returns:
            ``(True, summary)`` when no issue was found, otherwise
            ``(False, message)`` describing the problems.
        """
        print("🔍 Validating sitemap...")
        sitemap_path = self.static_dir / "sitemap.xml"

        if not sitemap_path.exists():
            return False, "Sitemap file not found"

        import xml.etree.ElementTree as ET
        from collections import Counter

        try:
            root = ET.parse(sitemap_path).getroot()
        except Exception as e:
            # Boundary: callers expect a (valid, message) tuple, never an exception.
            return False, f"XML parsing error: {e}"

        issues = []
        urls = []
        empty_locs = 0

        # Check namespace
        if 'sitemaps.org' not in root.tag:
            issues.append("Invalid sitemap namespace")

        # Check URL elements
        for url in self._extract_sitemap_locs(root):
            if not url:
                empty_locs += 1
                continue
            urls.append(url)

            # Validate URL format
            if not self._is_site_url(url):
                issues.append(f"URL doesn't match base URL: {url}")

            # Check for common issues
            if '#' in url:
                issues.append(f"URL contains fragment: {url}")
            if '?' in url and 'utm_' in url:
                issues.append(f"URL contains tracking parameters: {url}")

        if empty_locs:
            issues.append(f"{empty_locs} URL entries have a missing or empty <loc>")

        # Check for duplicates (sorted so the message is deterministic)
        duplicates = sorted(url for url, seen in Counter(urls).items() if seen > 1)
        if duplicates:
            issues.append(f"Duplicate URLs found: {duplicates}")

        if issues:
            return False, f"Validation issues: {'; '.join(issues)}"
        return True, f"Sitemap valid with {len(urls)} URLs"

    def check_robots_txt(self):
        """Validate ``app/static/robots.txt``.

        Requires at least one ``User-agent`` and one ``Sitemap`` directive,
        at least one ``Sitemap`` URL under ``base_url``, and flags a
        ``Disallow: /`` (exactly the root) when the file has no ``Allow`` line.

        Returns:
            ``(True, message)`` when properly configured, otherwise
            ``(False, message)`` listing the issues.
        """
        print("🤖 Checking robots.txt...")
        robots_path = self.static_dir / "robots.txt"

        if not robots_path.exists():
            return False, "robots.txt file not found"

        try:
            # utf-8-sig tolerates a leading BOM; undecodable bytes are replaced
            # so a stray byte in a comment cannot abort the whole check.
            content = robots_path.read_text(encoding='utf-8-sig', errors='replace')
        except OSError as e:
            return False, f"Error reading robots.txt: {e}"

        directives = self._parse_robots_directives(content)
        fields = {field for field, _ in directives}
        sitemap_urls = [value for field, value in directives if field == 'sitemap']

        issues = []

        # Check for required elements
        if 'user-agent' not in fields:
            issues.append("No User-agent directive found")
        if not sitemap_urls:
            issues.append("No Sitemap directive found")
        elif not any(self._is_site_url(url) for url in sitemap_urls):
            # Check sitemap URL
            issues.append("Sitemap URL doesn't match base URL")

        # Check for common mistakes: only an exact "/" blocks the whole site;
        # "Disallow: /admin/" and similar path rules are fine.
        disallows_root = any(field == 'disallow' and value == '/' for field, value in directives)
        if disallows_root and 'allow' not in fields:
            issues.append("Disallowing root without specific allows")

        if issues:
            return False, f"robots.txt issues: {'; '.join(issues)}"
        return True, "robots.txt is properly configured"

    def check_site_health(self, max_checks=10):
        """Fetch key pages and check status, response time and essential meta tags.

        Args:
            max_checks: Upper bound on how many of the built-in key pages are
                requested (taken from the start of the list).

        Returns:
            Dict with ``health_score`` (percentage of pages that returned 200
            with no issues), ``total_checks``, ``healthy_pages``,
            ``avg_response_time`` (seconds, averaged over the requests that
            actually received a response) and ``detailed_results``.
        """
        print("🏥 Checking site health...")

        key_pages = [
            '/',
            '/theory',
            '/evidence',
            '/papers',
            '/simulators',
            '/forum',
            '/introduction',
            '/quick-start',
            '/math-reference',
            '/testable-predictions'
        ]

        results = []

        for page in key_pages[:max_checks]:
            url = urljoin(self.base_url, page)
            try:
                response = requests.get(url, timeout=10, headers={
                    'User-Agent': 'RFT-SEO-Monitor/1.0'
                })
            except requests.RequestException as e:
                # status_code 0 marks "no response received at all".
                results.append({
                    'url': url,
                    'status_code': 0,
                    'response_time': 0,
                    'issues': [f"Request failed: {str(e)}"]
                })
                continue

            if response.status_code != 200:
                issues = [f"HTTP {response.status_code}"]
            else:
                issues = self._find_page_issues(response.text, page)

            results.append({
                'url': url,
                'status_code': response.status_code,
                'response_time': response.elapsed.total_seconds(),
                'issues': issues
            })

        # Analyze results
        total_checks = len(results)
        healthy_pages = len([r for r in results if not r['issues'] and r['status_code'] == 200])

        # Failed requests carry a placeholder time of 0; averaging them in would
        # make a site with timeouts look faster than it is.
        response_times = [r['response_time'] for r in results if r['status_code'] != 0]
        avg_response_time = sum(response_times) / len(response_times) if response_times else 0

        health_score = (healthy_pages / total_checks) * 100 if total_checks > 0 else 0

        return {
            'health_score': health_score,
            'total_checks': total_checks,
            'healthy_pages': healthy_pages,
            'avg_response_time': avg_response_time,
            'detailed_results': results
        }

    def analyze_forum_seo(self):
        """Collect forum statistics from ``instance/forum.db``.

        Only rows with ``approved = 1`` are counted; rows with a NULL
        ``parent_id`` are counted as posts, the others as replies.

        Returns:
            Dict with ``total_posts``, ``total_replies``, ``recent_posts``
            (last 30 days), ``high_engagement_posts`` (net votes >= 5),
            ``top_categories`` (up to five ``(category, count)`` rows) and
            ``indexable_posts`` (posts capped at 100); or ``{"error": ...}``
            if the database is missing or a query fails.
        """
        print("💬 Analyzing forum SEO...")

        try:
            db_path = self.site_root / "instance" / "forum.db"
            if not db_path.exists():
                return {"error": "Forum database not found"}

            conn = sqlite3.connect(db_path)
            try:
                cursor = conn.cursor()

                # Get forum statistics
                cursor.execute("SELECT COUNT(*) FROM posts WHERE approved = 1 AND parent_id IS NULL")
                total_posts = cursor.fetchone()[0]

                cursor.execute("SELECT COUNT(*) FROM posts WHERE approved = 1 AND parent_id IS NOT NULL")
                total_replies = cursor.fetchone()[0]

                # Get recent activity
                # NOTE(review): this is a string comparison, so it assumes the
                # timestamp column holds ISO-style 'YYYY-MM-DD...' text - confirm.
                thirty_days_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
                cursor.execute("SELECT COUNT(*) FROM posts WHERE approved = 1 AND timestamp > ?", (thirty_days_ago,))
                recent_posts = cursor.fetchone()[0]

                # Get top categories
                cursor.execute('''
                    SELECT category, COUNT(*) as count
                    FROM posts 
                    WHERE approved = 1 AND parent_id IS NULL
                    GROUP BY category 
                    ORDER BY count DESC 
                    LIMIT 5
                ''')
                top_categories = cursor.fetchall()

                # Get posts with good engagement
                cursor.execute('''
                    SELECT COUNT(*) FROM posts 
                    WHERE approved = 1 AND parent_id IS NULL 
                    AND (upvotes - downvotes) >= 5
                ''')
                high_engagement_posts = cursor.fetchone()[0]
            finally:
                # Close even when a query fails so the database handle is not leaked.
                conn.close()

            return {
                'total_posts': total_posts,
                'total_replies': total_replies,
                'recent_posts': recent_posts,
                'high_engagement_posts': high_engagement_posts,
                'top_categories': top_categories,
                'indexable_posts': min(total_posts, 100)  # Limit for sitemap
            }

        except Exception as e:
            # Boundary: the report treats a failed analysis as data, not as a crash.
            return {"error": f"Database analysis failed: {e}"}

    def generate_seo_report(self):
        """Run every check, derive recommendations and save the report as JSON.

        The sitemap is regenerated first, then validated; robots.txt, site
        health and forum statistics follow. The report is written to
        ``seo_reports/seo_report_<YYYYmmdd_HHMMSS>.json``.

        Returns:
            ``(report, report_path)`` - the report dict and the ``Path`` it
            was written to.

        Raises:
            OSError: If the report file cannot be written.
        """
        print("📊 Generating SEO report...")

        # One clock reading so the report's timestamp and its filename agree.
        started_at = datetime.now()

        report = {
            'timestamp': started_at.isoformat(),
            'sitemap': {},
            'robots': {},
            'site_health': {},
            'forum_seo': {},
            'recommendations': []
        }

        # Sitemap analysis
        sitemap_success, sitemap_msg = self.generate_sitemap()
        sitemap_valid, sitemap_validation = self.validate_sitemap()

        report['sitemap'] = {
            'generated': sitemap_success,
            'generation_message': sitemap_msg,
            'valid': sitemap_valid,
            'validation_message': sitemap_validation
        }

        # Robots.txt analysis
        robots_valid, robots_msg = self.check_robots_txt()
        report['robots'] = {
            'valid': robots_valid,
            'message': robots_msg
        }

        # Site health check
        report['site_health'] = self.check_site_health()

        # Forum SEO analysis
        report['forum_seo'] = self.analyze_forum_seo()

        # Generate recommendations
        report['recommendations'] = self._build_recommendations(report)

        # Save report
        report_filename = f"seo_report_{started_at.strftime('%Y%m%d_%H%M%S')}.json"
        report_path = self.reports_dir / report_filename

        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)

        return report, report_path

    def create_search_console_urls(self):
        """Write every sitemap URL to ``seo_reports/search_console_urls.txt``.

        Entries with a missing or empty ``<loc>`` are skipped.

        Returns:
            The list of URLs written, one per sitemap entry; ``[]`` if the
            sitemap is missing or could not be processed.
        """
        print("🔍 Creating Search Console URL list...")

        sitemap_path = self.static_dir / "sitemap.xml"
        if not sitemap_path.exists():
            return []

        try:
            import xml.etree.ElementTree as ET
            root = ET.parse(sitemap_path).getroot()

            urls = [loc for loc in self._extract_sitemap_locs(root) if loc]

            # Save URL list for Search Console
            urls_file = self.reports_dir / "search_console_urls.txt"
            with open(urls_file, 'w', encoding='utf-8') as f:
                f.writelines(f"{url}\n" for url in urls)

            print(f"📝 Saved {len(urls)} URLs to {urls_file}")
            return urls

        except Exception as e:
            # Boundary: callers expect a list, never an exception.
            print(f"❌ Error creating URL list: {e}")
            return []

def main(argv=None):
    """Command-line entry point for the SEO manager.

    Exactly one action runs per invocation; when several action flags are
    given, the first one in this order wins: ``--generate-sitemap``,
    ``--validate``, ``--health-check``, ``--forum-analysis``,
    ``--search-console``. With no action flag (or ``--full-report``) the full
    report is generated.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Returns:
        Process exit status: 1 when sitemap generation fails, when
        ``--validate`` finds an invalid sitemap or robots.txt, or when the
        forum analysis reports an error; 0 otherwise. The health check, URL
        list and full report are informational and return 0.
    """
    parser = argparse.ArgumentParser(description='RFT Cosmology SEO Manager')
    parser.add_argument('--generate-sitemap', action='store_true', help='Generate sitemap only')
    parser.add_argument('--validate', action='store_true', help='Validate SEO files only')
    parser.add_argument('--health-check', action='store_true', help='Run site health check only')
    parser.add_argument('--forum-analysis', action='store_true', help='Analyze forum SEO only')
    parser.add_argument('--full-report', action='store_true', help='Generate full SEO report')
    parser.add_argument('--search-console', action='store_true', help='Generate Search Console URL list')
    parser.add_argument('--site-root', default=None,
                        help='Site checkout directory (defaults to the SEOManager default)')
    parser.add_argument('--base-url', default=None,
                        help='Public base URL of the site (defaults to the SEOManager default)')

    args = parser.parse_args(argv)

    # Only pass what the user supplied, so SEOManager's own defaults stay the
    # single source of truth.
    manager_kwargs = {}
    if args.site_root:
        manager_kwargs['site_root'] = args.site_root
    if args.base_url:
        manager_kwargs['base_url'] = args.base_url
    seo_manager = SEOManager(**manager_kwargs)

    if args.generate_sitemap:
        success, message = seo_manager.generate_sitemap()
        print(f"Result: {message}")
        return 0 if success else 1

    if args.validate:
        sitemap_valid, sitemap_msg = seo_manager.validate_sitemap()
        robots_valid, robots_msg = seo_manager.check_robots_txt()
        print(f"Sitemap: {'✅' if sitemap_valid else '❌'} {sitemap_msg}")
        print(f"Robots.txt: {'✅' if robots_valid else '❌'} {robots_msg}")
        return 0 if sitemap_valid and robots_valid else 1

    if args.health_check:
        health = seo_manager.check_site_health()
        print(f"Site Health Score: {health['health_score']:.1f}%")
        print(f"Healthy Pages: {health['healthy_pages']}/{health['total_checks']}")
        print(f"Avg Response Time: {health['avg_response_time']:.2f}s")
        return 0

    if args.forum_analysis:
        forum_data = seo_manager.analyze_forum_seo()
        if 'error' in forum_data:
            print(f"❌ {forum_data['error']}")
            return 1
        print(f"Forum Posts: {forum_data['total_posts']}")
        print(f"Recent Activity: {forum_data['recent_posts']} posts (30 days)")
        print(f"High Engagement: {forum_data['high_engagement_posts']} posts")
        return 0

    if args.search_console:
        urls = seo_manager.create_search_console_urls()
        print(f"Generated {len(urls)} URLs for Search Console")
        return 0

    # Default: generate full report
    print("🚀 Generating comprehensive SEO report...")
    report, report_path = seo_manager.generate_seo_report()

    print("\n📊 SEO Report Summary:")
    print(f"Sitemap: {'✅' if report['sitemap']['generated'] else '❌'}")
    print(f"Robots.txt: {'✅' if report['robots']['valid'] else '❌'}")
    print(f"Site Health: {report['site_health']['health_score']:.1f}%")
    print(f"Forum Posts: {report['forum_seo'].get('total_posts', 'N/A')}")

    if report['recommendations']:
        print("\n💡 Recommendations:")
        for i, rec in enumerate(report['recommendations'], 1):
            print(f"  {i}. {rec}")
    else:
        print("\n🎉 No major SEO issues found!")

    print(f"\n📋 Full report saved to: {report_path}")
    return 0


if __name__ == "__main__":
    # Propagate the status so cron jobs / CI can detect a failed run.
    sys.exit(main())