#!/usr/bin/env python3
"""
Simple DOCX text extractor using zipfile (no external dependencies)
DOCX files are actually ZIP archives containing XML files
"""

import zipfile
import xml.etree.ElementTree as ET
import sys
import os
from pathlib import Path


# WordprocessingML namespace in ElementTree's "{uri}" tag-prefix form.
_W_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
_W_P = _W_NS + 'p'
_W_R = _W_NS + 'r'
_W_T = _W_NS + 't'

# Run-level elements that have no text node of their own but stand for a
# character in the rendered document. Dropping them fuses adjacent words.
_RUN_CHAR_ELEMENTS = {
    _W_NS + 'tab': '\t',
    _W_NS + 'br': '\n',
    _W_NS + 'cr': '\n',
    _W_NS + 'noBreakHyphen': '-',
}


def _collect_paragraph_text(element, pieces, inside_run=False):
    """Append the text found below *element* to *pieces* in document order."""
    for child in element:
        tag = child.tag
        if tag == _W_T:
            if child.text:
                pieces.append(child.text)
        elif inside_run and tag in _RUN_CHAR_ELEMENTS:
            # Only direct children of a run count: <w:tab> is also used for
            # tab-stop definitions under <w:pPr>/<w:tabs>, which are not text.
            pieces.append(_RUN_CHAR_ELEMENTS[tag])
        else:
            _collect_paragraph_text(child, pieces, inside_run=(tag == _W_R))


def extract_text_from_docx_simple(docx_path):
    """Extract the text of a DOCX file using built-in libraries only.

    Reads ``word/document.xml`` from the archive and collects the text of
    every ``w:p`` paragraph. Inside runs, tabs become ``"\\t"``, line breaks
    and carriage returns become ``"\\n"``, and non-breaking hyphens become
    ``"-"``, so words separated only by those elements are not fused.

    Args:
        docx_path: Path to the DOCX file, or any other value accepted by
            ``zipfile.ZipFile`` (such as a binary file-like object).

    Returns:
        The non-empty paragraphs joined by a blank line. If anything goes
        wrong (missing file, not a ZIP archive, no ``word/document.xml``,
        malformed XML), no exception is raised; a string starting with
        ``"Error extracting text from"`` is returned instead.
    """
    text_content = []

    try:
        # DOCX files are ZIP archives; the main body is word/document.xml.
        with zipfile.ZipFile(docx_path, 'r') as docx_zip:
            with docx_zip.open('word/document.xml') as document_xml:
                root = ET.parse(document_xml).getroot()

        # NOTE: root.iter() also yields paragraphs nested inside another
        # paragraph, so such text is collected with its parent and again on
        # its own, exactly as before.
        for paragraph in root.iter(_W_P):
            pieces = []
            _collect_paragraph_text(paragraph, pieces)
            if pieces:
                text_content.append(''.join(pieces))

        return '\n\n'.join(text_content)

    except Exception as e:
        # Best-effort contract relied on by callers: report the failure as
        # the returned text instead of raising.
        return f"Error extracting text from {docx_path}: {str(e)}"


def main():
    """Command-line entry point.

    Takes the DOCX path as the first argument and an optional output file as
    the second. Without an output file the extracted text is printed to
    stdout; with one, it is written there as UTF-8 below a short header.
    Exits with status 1 when no argument is given or the input file does not
    exist.
    """
    argv = sys.argv

    # Guard clauses: bail out early on bad invocations.
    if len(argv) < 2:
        print("Usage: python simple_docx_extract.py <docx_file> [output_file]")
        sys.exit(1)

    docx_path = Path(argv[1])
    if not docx_path.exists():
        print(f"Error: File not found: {docx_path}")
        sys.exit(1)

    print(f"Extracting text from: {docx_path}")
    text = extract_text_from_docx_simple(docx_path)

    # No output file requested: dump to stdout under a separator line.
    if len(argv) <= 2:
        print("\n" + "=" * 80)
        print(text)
        return

    output_path = Path(argv[2])
    header = f"Extracted from: {docx_path.name}\n" + "=" * 80 + "\n\n"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header + text)
    print(f"Text saved to: {output_path}")

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()