import json
import os
import sys
import re
from pathlib import Path

def run(args):
    """
    Entry point for the ``convert`` command.

    Turns an OCR JSON result file into Markdown output by delegating to
    ``convert_json_to_markdown``.

    Args:
        args: Namespace produced by argparse. ``output_file`` and
            ``json_file`` are read here; ``single_file`` is set to True
            on the namespace when an output file name was supplied.
    """
    # Naming an explicit output file implies one combined document.
    single_output_requested = bool(args.output_file)
    if single_output_requested:
        args.single_file = True

    convert_json_to_markdown(args.json_file, args)

# OCR response structure classes
class OCRResponseImage:
    """
    One image carried by an OCR response page.

    Attributes:
        id (str): Identifier of the image
        image_base64 (str): Image payload encoded as base64
    """
    def __init__(self, id, image_base64):
        # Plain value holder: both fields are stored exactly as given.
        self.id, self.image_base64 = id, image_base64

class OCRResponsePage:
    """
    One page of an OCR response.

    Attributes:
        index (int): Page index as supplied by the caller
        markdown (str): Text content of the page in Markdown format
        image (str, optional): Main page image, when one is provided
        images (list): OCRResponseImage objects belonging to the page
        dimensions (dict, optional): Page dimensions, when provided
    """
    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        self.index = index
        self.markdown = markdown
        self.image = image
        self.dimensions = dimensions
        # Any falsy value (None, empty list, ...) is replaced by a fresh
        # list so that instances never share a default container.
        self.images = images if images else []

class OCRResponseMetadata:
    """
    Document-level metadata of an OCR response.

    Every field is optional and defaults to None.

    Attributes:
        title (str, optional): Title of the document
        author (str, optional): Author of the document
        creation_date (str, optional): Creation date of the document
        page_count (int, optional): Number of pages in the document
    """
    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        # Plain value holder: fields are stored exactly as given.
        self.title, self.author = title, author
        self.creation_date, self.page_count = creation_date, page_count

class OCRResponse:
    """
    Complete OCR response: the pages plus document metadata.

    Attributes:
        pages (list): OCRResponsePage objects, in the order supplied
        metadata (OCRResponseMetadata): Metadata of the document
    """
    def __init__(self, pages=None, metadata=None):
        # Falsy arguments are replaced by fresh defaults so that instances
        # never share a page list or a metadata object.
        self.pages = pages if pages else []
        self.metadata = metadata if metadata else OCRResponseMetadata()

def replace_image_references(content, images, include_images):
    """
    Replace image references in markdown content with base64 data.

    Every reference of the exact form ``![<id>](<id>)`` whose ``<id>``
    belongs to an image that carries base64 data is rewritten to
    ``![<id>](<data URI>)``. Image data that does not already start with
    ``data:`` is prefixed with ``data:image/jpeg;base64,``.

    Args:
        content (str): Markdown content with image references
        images (list): Objects exposing ``id`` and ``image_base64``
            attributes (e.g. OCRResponseImage)
        include_images (bool): Whether to include images in the output

    Returns:
        str: The content with matching references replaced; the content is
        returned unchanged when ``include_images`` is falsy or ``images``
        is empty.
    """
    if not include_images or not images:
        return content

    # Map each image ID to its data URI. When several images share an ID,
    # the last one wins.
    data_uri_by_id = {}
    for img in images:
        if not img.image_base64:
            continue
        img_data = img.image_base64
        if not img_data.startswith("data:"):
            img_data = "data:image/jpeg;base64," + img_data
        data_uri_by_id[img.id] = img_data

    # The reference is a fixed literal, so plain string replacement is used.
    # A regex substitution would interpret backslashes in the ID or data as
    # replacement escapes (raising on IDs such as ``a\d``).
    for image_id, data_uri in data_uri_by_id.items():
        reference = f"![{image_id}]({image_id})"
        content = content.replace(reference, f"![{image_id}]({data_uri})")

    return content

def _parse_ocr_response(data):
    """
    Build an OCRResponse from a decoded OCR JSON document.

    Args:
        data: Decoded JSON content; the "pages" and "metadata" keys are
            optional, and an explicit null for either is treated as absent

    Returns:
        OCRResponse: Parsed pages (with their images) and metadata
    """
    ocr_response = OCRResponse()

    raw_pages = data["pages"] if "pages" in data else None
    for position, page_data in enumerate(raw_pages or []):
        # Fall back to the page's position so that pages lacking an index
        # do not all collapse onto 0 (which would make per-page output
        # files overwrite each other and label every page "Page 1").
        index = page_data.get("index")
        if index is None:
            index = position

        page = OCRResponsePage(
            index=index,
            markdown=page_data.get("markdown") or "",
            image=page_data.get("image", ""),
        )

        for img_data in page_data.get("images") or []:
            page.images.append(OCRResponseImage(
                id=img_data.get("id", ""),
                image_base64=img_data.get("image_base64", ""),
            ))

        ocr_response.pages.append(page)

    raw_metadata = data["metadata"] if "metadata" in data else None
    if raw_metadata:
        ocr_response.metadata = OCRResponseMetadata(
            title=raw_metadata.get("title", ""),
            author=raw_metadata.get("author", ""),
            creation_date=raw_metadata.get("creation_date", ""),
            page_count=raw_metadata.get("page_count", 0),
        )

    return ocr_response


def _build_single_document(ocr_response, json_file, args):
    """
    Render all pages of an OCR response as one markdown document.

    Args:
        ocr_response (OCRResponse): Parsed OCR response
        json_file (str): Path of the source JSON file (used for the title
            when ``args.title_from_filename`` is set and no metadata title
            exists)
        args: Conversion options; reads ``title_from_filename``, ``images``
            and ``page_breaks``

    Returns:
        str: The combined markdown text
    """
    metadata = ocr_response.metadata

    # Title precedence: metadata title, then file name (if requested),
    # then a generic default.
    title = "Document"
    if metadata.title:
        title = metadata.title
    elif args.title_from_filename:
        title = Path(json_file).stem

    combined = [f"# {title}\n"]

    if metadata.author or metadata.creation_date or metadata.page_count:
        combined.append("## Document Metadata\n")
        if metadata.author:
            combined.append(f"**Author:** {metadata.author}\n")
        if metadata.creation_date:
            combined.append(f"**Creation Date:** {metadata.creation_date}\n")
        if metadata.page_count:
            combined.append(f"**Page Count:** {metadata.page_count}\n")
        combined.append("\n")

    last_position = len(ocr_response.pages) - 1
    for position, page in enumerate(ocr_response.pages):
        combined.append(f"## Page {page.index + 1}\n")
        # replace_image_references returns the content untouched when
        # images are not requested.
        combined.append(replace_image_references(page.markdown, page.images, args.images))
        combined.append("\n")

        # Separator between pages, never after the last one.
        if args.page_breaks and position < last_position:
            combined.append("\n---\n")

    return "\n".join(combined)


def convert_json_to_markdown(json_file, args):
    """
    Convert OCR JSON results to Markdown format.

    In single-file mode (``args.single_file``) all pages are written to one
    document inside ``args.output_dir`` (named ``args.output_file`` when
    given, otherwise ``document.md``). Otherwise each page is written to
    ``<page index>.md`` inside ``args.output_dir``.

    Args:
        json_file (str): Path to the JSON file containing OCR results
        args: Command line arguments containing conversion options; reads
            ``output_dir``, ``single_file``, ``output_file``, ``images``,
            ``title_from_filename`` and ``page_breaks``

    Raises:
        SystemExit: If an error occurs during conversion (the error is
            printed to stderr and the exit status is 1)
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ocr_response = _parse_ocr_response(data)

        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)

        if args.single_file:
            document = _build_single_document(ocr_response, json_file, args)

            if args.output_file:
                output_path = Path(args.output_dir) / args.output_file
                # output_file may contain directory components of its own.
                output_path.parent.mkdir(parents=True, exist_ok=True)
            else:
                output_path = Path(args.output_dir) / "document.md"

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(document)

            print(f"Created single markdown file: {output_path}")
        else:
            for page in ocr_response.pages:
                # The page index is the file name.
                output_path = Path(args.output_dir) / f"{page.index}.md"
                markdown_content = replace_image_references(
                    page.markdown, page.images, args.images
                )

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)

                print(f"Created markdown file: {output_path}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(ocr_response.pages)}")

    except Exception as e:
        # Command boundary: report any failure and exit non-zero, as the
        # documented contract requires.
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)