#!/usr/bin/env python3
"""
Elasticsearch Index Backup Script
==================================
Backs up an Elasticsearch index to a single compressed file containing:
- Index mapping and settings
- All documents

Usage:
    python backup_index.py
    python backup_index.py --index arabic_research
    python backup_index.py --index arabic_research --output ./my_backups
"""

import os
import sys
import json
import gzip
import argparse
from datetime import datetime
from pathlib import Path
from typing import Dict, Any

try:
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import scan
except ImportError:
    print("ERROR: elasticsearch package not installed.")
    print("Run: pip install elasticsearch")
    sys.exit(1)

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = lambda: None


def load_config() -> Dict[str, str]:
    """Build the runtime configuration from a .env file / environment.

    Each setting falls back to a hard-coded default when the corresponding
    environment variable is unset.
    """
    load_dotenv()

    # config key -> (environment variable, default value)
    sources = {
        'es_host': ('TARGET_ES_HOST', 'http://localhost:9200'),
        'index': ('TARGET_INDEX', 'arabic_research'),
        'backup_dir': ('BACKUP_DIR', './backups'),
    }
    return {key: os.getenv(var, fallback) for key, (var, fallback) in sources.items()}


def connect_elasticsearch(host: str) -> Elasticsearch:
    """Open a client against *host* and confirm the cluster answers.

    Raises ConnectionError when the cluster cannot be pinged.
    NOTE: TLS certificate verification is disabled (verify_certs=False),
    so this trusts whatever endpoint the URL points at.
    """
    client = Elasticsearch(hosts=[host], verify_certs=False, request_timeout=300)

    if client.ping():
        version = client.info()['version']['number']
        print(f"✅ Connected to Elasticsearch {version}")
        return client

    raise ConnectionError(f"Cannot connect to Elasticsearch at {host}")


def get_index_info(es: Elasticsearch, index: str) -> Dict[str, Any]:
    """Fetch mapping, settings, and primary-shard stats for *index*.

    Prints a short human-readable summary as a side effect.
    Raises ValueError when the index does not exist.
    """
    if not es.indices.exists(index=index):
        raise ValueError(f"Index '{index}' does not exist")

    # Stats are reported for primary shards only (no replica double-counting).
    stats = es.indices.stats(index=index)
    primaries = stats['indices'][index]['primaries']
    docs = primaries['docs']['count']
    stored_bytes = primaries['store']['size_in_bytes']

    print(f"📊 Index: {index}")
    print(f"   Documents: {docs:,}")
    print(f"   Size: {stored_bytes / (1024 * 1024):.2f} MB")

    return {
        'mapping': es.indices.get_mapping(index=index)[index]['mappings'],
        'settings': es.indices.get_settings(index=index)[index]['settings'],
        'doc_count': docs,
        'size_bytes': stored_bytes,
    }


def clean_surrogates(obj):
    """Return *obj* with lone surrogate code points scrubbed from all strings.

    Dicts and lists are rebuilt recursively; every other type is returned
    unchanged.
    """
    if isinstance(obj, dict):
        return {key: clean_surrogates(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [clean_surrogates(element) for element in obj]
    if isinstance(obj, str):
        # Round-trip through UTF-8: 'surrogatepass' lets lone surrogates be
        # encoded, and the 'replace' decode turns the resulting invalid byte
        # sequences into U+FFFD so the string is safely serializable.
        raw = obj.encode('utf-8', errors='surrogatepass')
        return raw.decode('utf-8', errors='replace')
    return obj


def backup_index(es: Elasticsearch, index: str, output_dir: str) -> str:
    """
    Write a full backup of *index* into one gzipped JSONL file.

    File layout:
    - Line 1: header record (metadata, mapping, settings)
    - Lines 2+: one document record per line

    Returns the path of the created backup file as a string.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_file = target_dir / f"{index}_backup_{stamp}.json.gz"

    print("\n📋 Fetching index information...")
    info = get_index_info(es, index)

    print(f"\n📥 Backing up to: {backup_file}")

    written = 0
    with gzip.open(backup_file, 'wt', encoding='utf-8') as out:
        # First line: everything needed to recreate the index later.
        header = {
            'type': 'header',
            'index': index,
            'backup_timestamp': datetime.now().isoformat(),
            'doc_count': info['doc_count'],
            'mapping': info['mapping'],
            'settings': info['settings'],
        }
        out.write(json.dumps(header, ensure_ascii=False) + '\n')

        # Stream every document via the scroll API, one JSON line each.
        for hit in scan(es, index=index, query={"query": {"match_all": {}}},
                        scroll='5m', size=1000):
            record = {
                'type': 'document',
                '_id': hit['_id'],
                '_source': clean_surrogates(hit['_source']),
            }
            out.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1

            if written % 1000 == 0:
                print(f"   Processed {written:,} documents...", end='\r')

    size_mb = backup_file.stat().st_size / (1024 * 1024)

    print("\n\n✅ Backup complete!")
    print(f"   File: {backup_file}")
    print(f"   Documents: {written:,}")
    print(f"   Compressed size: {size_mb:.2f} MB")

    return str(backup_file)


def main():
    """CLI entry point: parse arguments, merge with env config, run the backup.

    Returns a process exit code: 0 on success, 1 on any failure.
    """
    parser = argparse.ArgumentParser(description='Backup Elasticsearch index')
    parser.add_argument('--index', '-i', help='Index name to backup')
    parser.add_argument('--host', '-H', help='Elasticsearch host URL')
    parser.add_argument('--output', '-o', help='Output directory for backup file')

    args = parser.parse_args()

    config = load_config()

    # Command-line flags take precedence over .env / environment defaults.
    es_host = args.host or config['es_host']
    index = args.index or config['index']
    output_dir = args.output or config['backup_dir']

    print("""
╔══════════════════════════════════════════════════════════════════╗
║           ELASTICSEARCH INDEX BACKUP                              ║
╚══════════════════════════════════════════════════════════════════╝
    """)

    print("🔧 Configuration:")
    # Strip any "user:pass@" credentials from the URL before echoing it.
    print(f"   Host: {es_host.split('@')[-1] if '@' in es_host else es_host}")
    print(f"   Index: {index}")
    print(f"   Output: {output_dir}")

    try:
        es = connect_elasticsearch(es_host)
        backup_index(es, index, output_dir)
        print("\n🎉 Backup completed successfully!")
        return 0

    except Exception as e:
        # Broad catch is deliberate: this is the top-level CLI boundary and
        # any failure should surface as a message plus a non-zero exit code.
        print(f"\n❌ Error: {e}")
        return 1


if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
