# mistral-ocr/mistral_ocr/commands/convert.py

import json
import os
import sys
import re
import base64
import uuid
from pathlib import Path

def run(args):
    """
    Entry point for the ``convert`` command.

    Hands the parsed arguments to convert_json_to_markdown, forcing
    single-file mode first when an explicit output file was requested.

    Args:
        args: Command line arguments parsed by argparse
    """
    # Naming an output file implies that all pages go into that one file
    if args.output_file:
        setattr(args, "single_file", True)

    convert_json_to_markdown(args.json_file, args)

# OCR response structure classes
class OCRResponseImage:
    """
    A single image embedded in an OCR page.

    Attributes:
        id (str): Identifier the page markdown uses to refer to the image
        image_base64 (str): Image payload, base64-encoded
    """

    def __init__(self, id, image_base64):
        # Plain value holder: both fields are stored exactly as given
        self.id, self.image_base64 = id, image_base64

class OCRResponsePage:
    """
    One page of an OCR response.

    Attributes:
        index (int): Page index as reported by the OCR result
        markdown (str): Page text in Markdown format
        image (str, optional): Main page image, when one is supplied
        images (list): OCRResponseImage objects belonging to the page
        dimensions (dict, optional): Page dimensions, when supplied
    """

    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        self.index = index
        self.markdown = markdown
        self.image = image
        self.dimensions = dimensions
        # A missing or empty image list becomes a fresh list per instance
        self.images = images if images else []

class OCRResponseMetadata:
    """
    Document-level metadata carried by an OCR response.

    Every field is optional and defaults to None.

    Attributes:
        title (str, optional): Title of the document
        author (str, optional): Author of the document
        creation_date (str, optional): Creation date of the document
        page_count (int, optional): Number of pages in the document
    """

    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        self.title, self.author = title, author
        self.creation_date, self.page_count = creation_date, page_count

class OCRResponse:
    """
    The full OCR result: its pages plus document metadata.

    Attributes:
        pages (list): OCRResponsePage objects
        metadata (OCRResponseMetadata): Metadata describing the document
    """

    def __init__(self, pages=None, metadata=None):
        # Missing (or empty) arguments are replaced by fresh defaults
        self.pages = pages if pages else []
        self.metadata = metadata if metadata else OCRResponseMetadata()

def extract_image_to_file(image_base64, image_id, image_dir):
    """
    Extract a base64-encoded image to a file.

    The payload is decoded before the output file is opened, so invalid
    data never leaves an empty image file behind.

    Args:
        image_base64 (str): Base64-encoded image data, optionally wrapped in
            a ``data:image/<type>;base64,`` URL
        image_id (str): Unique identifier for the image
        image_dir (str): Directory to save the image

    Returns:
        str: ``<last component of image_dir>/<filename>``, intended as a path
            relative to the markdown file, or None if the image could not be
            decoded or written (a warning is printed to stderr)
    """
    # Create the image directory if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    # Default to JPEG unless a data URL prefix says otherwise
    image_type = "jpeg"
    image_data = image_base64
    if image_base64.startswith("data:"):
        # DOTALL keeps line-wrapped base64 payloads intact
        match = re.match(r"data:image/(\w+);base64,(.+)", image_base64, re.DOTALL)
        if match:
            image_type, image_data = match.groups()
        else:
            # Unrecognised header: keep the JPEG default and take whatever
            # follows the first comma as the payload
            _, separator, image_data = image_base64.partition(",")
            if not separator:
                print(
                    f"Warning: Failed to save image {image_id}: "
                    "malformed data URL (no payload)",
                    file=sys.stderr,
                )
                return None

    # Generate a filename based on the image ID
    # Use a sanitized version of the image ID or a UUID if needed
    safe_id = re.sub(r'[^\w\-_]', '_', image_id or "") or str(uuid.uuid4())
    filename = f"{safe_id}.{image_type}"
    file_path = os.path.join(image_dir, filename)

    # Decode first so a bad payload does not create an empty file
    try:
        image_bytes = base64.b64decode(image_data)
    except (ValueError, TypeError) as e:
        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
        return None

    try:
        with open(file_path, 'wb') as f:
            f.write(image_bytes)
    except OSError as e:
        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
        return None

    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" and lose the directory part of the link
    dir_name = os.path.basename(os.path.normpath(image_dir))
    return os.path.join(dir_name, filename)

def replace_image_references(content, images, include_images, extract_images=False, image_dir=None):
    """
    Replace image references in markdown content with either base64 data or file references.

    References have the form ``![<id>](<id>)``. Each one whose id matches an
    image with data is rewritten to ``![<id>](<target>)``, where the target
    is a data URL or, when extracting, the path of the saved file.

    Args:
        content (str): Markdown content with image references
        images (list): List of OCRResponseImage objects
        include_images (bool): Whether to include images in the output
        extract_images (bool, optional): Whether to extract images to files. Defaults to False.
        image_dir (str, optional): Directory to save extracted images. Defaults to None.

    Returns:
        str: Markdown content with image references replaced
    """
    if not include_images or not images:
        return content

    save_to_disk = bool(extract_images and image_dir)

    # Map each image ID to its link target (data URL or file path)
    image_map = {}
    for img in images:
        if not img.image_base64:
            continue

        # Bare base64 is assumed to be JPEG and wrapped in a data URL
        data_url = img.image_base64
        if not data_url.startswith("data:"):
            data_url = "data:image/jpeg;base64," + data_url

        if save_to_disk:
            target = extract_image_to_file(data_url, img.id, image_dir)
            if not target:
                # Extraction failed; leave the original reference untouched
                continue
            # Markdown link destinations use forward slashes on every platform
            target = target.replace(os.sep, "/")
        else:
            target = data_url
        image_map[img.id] = target

    # Literal replacement: re.sub would treat backslashes in the ID or target
    # (e.g. Windows paths) as escape sequences in the replacement string
    for image_id, target in image_map.items():
        content = content.replace(
            f"![{image_id}]({image_id})",
            f"![{image_id}]({target})",
        )

    return content

def _parse_ocr_response(data):
    """
    Build an OCRResponse from decoded OCR JSON.

    Args:
        data: Decoded JSON document; the "pages" and "metadata" keys are optional

    Returns:
        OCRResponse: Parsed pages (with their images) and metadata
    """
    ocr_response = OCRResponse()

    # Parse pages (a null "pages" value is treated as no pages)
    if "pages" in data:
        for position, page_data in enumerate(data["pages"] or []):
            # Fall back to the page's position when the index is missing or
            # null, so pages do not all collapse onto index 0
            index = page_data.get("index")
            if index is None:
                index = position

            page = OCRResponsePage(
                index=index,
                markdown=page_data.get("markdown") or "",
                image=page_data.get("image", "")
            )

            # Parse images if present
            for img_data in page_data.get("images") or []:
                page.images.append(OCRResponseImage(
                    id=img_data.get("id") or "",
                    image_base64=img_data.get("image_base64") or ""
                ))

            ocr_response.pages.append(page)

    # Parse metadata (a null "metadata" value yields empty metadata fields)
    if "metadata" in data:
        metadata = data["metadata"] or {}
        ocr_response.metadata = OCRResponseMetadata(
            title=metadata.get("title", ""),
            author=metadata.get("author", ""),
            creation_date=metadata.get("creation_date", ""),
            page_count=metadata.get("page_count", 0)
        )

    return ocr_response


def _render_page_markdown(page, args, image_dir):
    """Return the page's markdown, with image references resolved when args.images is set."""
    if not args.images:
        return page.markdown
    return replace_image_references(
        page.markdown,
        page.images,
        args.images,
        args.extract_images,
        image_dir
    )


def _build_combined_markdown(ocr_response, json_file, args, image_dir):
    """Return the single-file document: title, optional metadata section, then every page."""
    metadata = ocr_response.metadata

    # Title precedence: metadata title, then the JSON filename stem, then a fixed default
    if metadata.title:
        title = metadata.title
    elif args.title_from_filename:
        title = Path(json_file).stem
    else:
        title = "Document"

    combined = [f"# {title}\n"]

    # Add metadata if available
    if metadata.author or metadata.creation_date or metadata.page_count:
        combined.append("## Document Metadata\n")
        if metadata.author:
            combined.append(f"**Author:** {metadata.author}\n")
        if metadata.creation_date:
            combined.append(f"**Creation Date:** {metadata.creation_date}\n")
        if metadata.page_count:
            combined.append(f"**Page Count:** {metadata.page_count}\n")
        combined.append("\n")

    last_position = len(ocr_response.pages) - 1
    for position, page in enumerate(ocr_response.pages):
        combined.append(f"## Page {page.index + 1}\n")
        combined.append(_render_page_markdown(page, args, image_dir))
        combined.append("\n")

        # Add page separator if not the last page
        if args.page_breaks and position < last_position:
            combined.append("\n---\n")

    return "\n".join(combined)


def convert_json_to_markdown(json_file, args):
    """
    Convert OCR JSON results to Markdown format.

    In single-file mode all pages are written to one document
    (``args.output_file`` or ``document.md``) inside ``args.output_dir``;
    otherwise each page is written to ``<page index>.md`` in that directory.

    Args:
        json_file (str): Path to the JSON file containing OCR results
        args: Command line arguments containing conversion options. Reads
            output_dir, output_file, single_file, images, extract_images,
            image_dir, title_from_filename and page_breaks.

    Raises:
        SystemExit: If an error occurs during conversion
    """
    # Determine image directory if extracting images (default: output_dir/images)
    image_dir = None
    if args.extract_images and args.images:
        image_dir = args.image_dir or os.path.join(args.output_dir, "images")

    # Command boundary: any failure is reported on stderr and becomes exit code 1
    try:
        # Read JSON file
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ocr_response = _parse_ocr_response(data)

        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)

        if args.single_file:
            document = _build_combined_markdown(ocr_response, json_file, args, image_dir)

            # Use custom filename if provided, otherwise use default
            output_path = Path(args.output_dir) / (args.output_file or "document.md")
            # output_file may contain directory components; ensure they exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(document)

            print(f"Created single markdown file: {output_path}")
        else:
            # Process each page into a separate file named after its index
            for page in ocr_response.pages:
                output_path = Path(args.output_dir) / f"{page.index}.md"

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(_render_page_markdown(page, args, image_dir))

                print(f"Created markdown file: {output_path}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(ocr_response.pages)}")

    except Exception as e:
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)