#!/usr/bin/env python3
"""
Convert DOCX files to readable text format
Requires: pip install python-docx
"""

import os
import sys
from pathlib import Path

# python-docx is a third-party dependency. If it cannot be imported, print
# install instructions and exit with status 1 rather than showing a traceback.
try:
    from docx import Document
except ImportError:
    print("Error: python-docx not installed")
    print("Please run: pip install python-docx")
    sys.exit(1)


def extract_text_from_docx(docx_path):
    """Extract all text from a DOCX file.

    Non-blank paragraphs are collected first, followed by every table row
    rendered as its non-blank cell texts joined with " | ". The collected
    entries are separated by a blank line.

    Args:
        docx_path: Location of the DOCX file, passed straight to
            ``Document``.

    Returns:
        The extracted text. On any failure no exception is raised; a string
        of the form ``"Error reading <docx_path>: <reason>"`` is returned
        instead.
    """
    try:
        doc = Document(docx_path)

        # Paragraph text is kept as-is; only whitespace-only paragraphs are
        # dropped.
        full_text = [
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        ]

        for table in doc.tables:
            for row in table.rows:
                row_text = []
                previous_tc = None
                for cell in row.cells:
                    # python-docx repeats the same cell for every grid column
                    # a merged cell spans; emitting each repeat would
                    # duplicate its text ("A | A | A"). Consecutive entries
                    # backed by the same XML element are one visual cell.
                    current_tc = getattr(cell, "_tc", None)
                    is_repeat = (
                        current_tc is not None and current_tc is previous_tc
                    )
                    previous_tc = current_tc
                    if is_repeat:
                        continue

                    cell_text = cell.text.strip()
                    if cell_text:
                        row_text.append(cell_text)
                if row_text:
                    full_text.append(" | ".join(row_text))

        return "\n\n".join(full_text)

    except Exception as e:
        # Callers rely on receiving a string rather than an exception.
        return f"Error reading {docx_path}: {str(e)}"


def convert_docx_to_text(docx_path, output_path=None):
    """Convert a DOCX file to a UTF-8 text file.

    The output starts with an "Extracted from: <name>" header and a rule of
    80 "=" characters, followed by the extracted text.

    Args:
        docx_path: Path of the DOCX file to convert.
        output_path: Destination text file. Defaults to ``docx_path`` with
            its suffix replaced by ``.txt``.

    Returns:
        True if the text file was written. False if the input does not
        exist, its text could not be extracted, or the output could not be
        written. A message describing the outcome is printed in every case.
    """
    docx_path = Path(docx_path)

    if not docx_path.exists():
        print(f"Error: File not found: {docx_path}")
        return False

    # Extract text
    text_content = extract_text_from_docx(docx_path)

    # extract_text_from_docx reports failures by returning an
    # "Error reading <path>: <reason>" string instead of raising. Treat that
    # as a failed conversion rather than writing the error message into the
    # output file and reporting success.
    if text_content.startswith(f"Error reading {docx_path}: "):
        print(text_content)
        return False

    # Determine output path
    if output_path is None:
        output_path = docx_path.with_suffix('.txt')
    else:
        output_path = Path(output_path)

    # Write text file
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(f"Extracted from: {docx_path.name}\n")
            f.write("=" * 80 + "\n\n")
            f.write(text_content)
    except (OSError, UnicodeError) as e:
        print(f"Error writing {output_path}: {str(e)}")
        return False

    print(f"✓ Converted: {docx_path.name} -> {output_path.name}")
    return True


def batch_convert_directory(directory_path, output_dir=None):
    """Convert every DOCX file directly inside a directory.

    Only files matching ``*.docx`` in ``directory_path`` itself are
    considered (no recursion). Word owner/lock files, whose names start with
    ``~$``, are skipped because they are not real documents. Files are
    processed in sorted order so runs are deterministic.

    Args:
        directory_path: Directory to scan for DOCX files.
        output_dir: Optional directory for the generated ``.txt`` files. It
            is created if missing. When omitted, each text file is written
            next to its source document.

    Returns:
        None. Progress and a final success count are printed.
    """
    directory_path = Path(directory_path)

    if not directory_path.exists():
        print(f"Error: Directory not found: {directory_path}")
        return

    # Create output directory if specified
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Find all DOCX files, ignoring "~$name.docx" lock files that Word leaves
    # beside a document while it is open.
    docx_files = sorted(
        candidate
        for candidate in directory_path.glob("*.docx")
        if not candidate.name.startswith("~$")
    )

    if not docx_files:
        print(f"No DOCX files found in {directory_path}")
        return

    print(f"Found {len(docx_files)} DOCX files to convert...")

    # Convert each file
    success_count = 0
    for docx_file in docx_files:
        if output_dir:
            output_path = output_dir / docx_file.with_suffix('.txt').name
        else:
            output_path = None

        if convert_docx_to_text(docx_file, output_path):
            success_count += 1

    print(f"\nConversion complete: {success_count}/{len(docx_files)} files converted successfully")


def main():
    """Command-line entry point.

    Reads ``sys.argv``:

    * ``<docx_file> [output_file]`` converts a single file.
    * ``--batch <directory> [output_directory]`` converts a directory.

    Exits with status 1 when arguments are missing or when a single-file
    conversion fails, so shell callers can detect the failure.
    """
    if len(sys.argv) < 2:
        print("Usage:")
        print("  Convert single file:  python convert_docx.py <docx_file> [output_file]")
        print("  Convert directory:    python convert_docx.py --batch <directory> [output_directory]")
        print("\nExample:")
        print("  python convert_docx.py 'RFT 13.1.docx'")
        print("  python convert_docx.py --batch '/path/to/papers' '/path/to/output'")
        sys.exit(1)

    if sys.argv[1] == "--batch":
        # Batch conversion
        if len(sys.argv) < 3:
            print("Error: Please specify a directory for batch conversion")
            sys.exit(1)

        directory = sys.argv[2]
        output_dir = sys.argv[3] if len(sys.argv) > 3 else None
        batch_convert_directory(directory, output_dir)
        return

    # Single file conversion
    docx_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else None
    if not convert_docx_to_text(docx_file, output_file):
        # The failure message was already printed; reflect it in the exit
        # status.
        sys.exit(1)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()