# mistral-ocr/mistral_ocr/commands/convert.py

import json
import os
import sys
import re
from pathlib import Path

def run(args):
    """
    Entry point of the convert command: turn OCR JSON results into Markdown.

    Args:
        args: Namespace of parsed command line options. Reads ``output_file``
            and ``json_file``; may set ``single_file`` on it.
    """
    named_output_requested = bool(args.output_file)

    # Naming an output file implies that everything goes into one document.
    if named_output_requested:
        setattr(args, "single_file", True)

    convert_json_to_markdown(args.json_file, args)

# OCR response structure classes
class OCRResponseImage:
    """
    A single image entry of an OCR response.

    Attributes:
        id (str): Identifier of the image
        image_base64 (str): Image data as a base64 string
    """
    def __init__(self, id, image_base64):
        # Plain value holder: both arguments are stored unchanged.
        self.id, self.image_base64 = id, image_base64

class OCRResponsePage:
    """
    A single page entry of an OCR response.

    Attributes:
        index (int): Index of the page
        markdown (str): Page text as Markdown
        image (str, optional): Main image of the page, when there is one
        images (list): OCRResponseImage objects belonging to the page
        dimensions (dict, optional): Dimensions of the page
    """
    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        # Any falsy ``images`` value (None, empty sequence) becomes a fresh list
        # so instances never share a default container.
        self.images = images if images else []
        self.dimensions = dimensions
        self.image = image
        self.markdown = markdown
        self.index = index

class OCRResponseMetadata:
    """
    Document-level metadata of an OCR response.

    Every field is optional and defaults to None.

    Attributes:
        title (str, optional): Title of the document
        author (str, optional): Author of the document
        creation_date (str, optional): Creation date of the document
        page_count (int, optional): Number of pages in the document
    """
    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        # Values are stored exactly as given, without validation.
        self.title, self.author = title, author
        self.creation_date, self.page_count = creation_date, page_count

class OCRResponse:
    """
    The whole OCR response: its pages plus document metadata.

    Attributes:
        pages (list): OCRResponsePage objects
        metadata (OCRResponseMetadata): Metadata of the document
    """
    def __init__(self, pages=None, metadata=None):
        # Falsy arguments are replaced by fresh, empty defaults.
        if not pages:
            pages = []
        if not metadata:
            metadata = OCRResponseMetadata()

        self.pages = pages
        self.metadata = metadata

def replace_image_references(content, images, include_images):
    """
    Replace image references in markdown content with base64 data.

    A reference has the form ``![<id>](<id>)``. Every reference whose id
    belongs to an image that carries data is rewritten to
    ``![<id>](<data URI>)``. Image data that does not already start with
    ``data:`` is prefixed with ``data:image/jpeg;base64,``.

    Args:
        content (str): Markdown content with image references
        images (list): Objects exposing ``id`` and ``image_base64``
            (e.g. OCRResponseImage)
        include_images (bool): Whether to include images in the output

    Returns:
        str: The content with matching references rewritten, or the content
        unchanged when ``include_images`` is false or ``images`` is empty.
    """
    if not include_images or not images:
        return content

    # Map each image id to its data URI. Images without data are skipped and
    # a later image with the same id overrides an earlier one.
    image_map = {}
    for img in images:
        if not img.image_base64:
            continue
        img_data = img.image_base64
        if not img_data.startswith("data:"):
            img_data = "data:image/jpeg;base64," + img_data
        image_map[img.id] = img_data

    # The reference is a fixed string, so plain substring replacement is used
    # instead of re.sub: re.sub interprets backslashes in the replacement
    # text (e.g. "\1" as a group reference), which breaks ids containing a
    # backslash, and it needlessly parses the large base64 payload as a
    # replacement template.
    for image_id, img_data in image_map.items():
        reference = f"![{image_id}]({image_id})"
        content = content.replace(reference, f"![{image_id}]({img_data})")

    return content

def _parse_ocr_response(data):
    """
    Build an OCRResponse from the decoded OCR JSON document.

    Args:
        data (dict): Decoded top-level JSON object

    Returns:
        OCRResponse: Parsed pages and metadata

    Raises:
        ValueError: If the top-level JSON value is not an object
    """
    if not isinstance(data, dict):
        raise ValueError("top-level JSON value must be an object")

    ocr_response = OCRResponse()

    # `or []` / `or ""` tolerate keys that are present but null in the JSON.
    for position, page_data in enumerate(data.get("pages") or []):
        page = OCRResponsePage(
            # Fall back to the page's position: a constant default would give
            # every index-less page the same index and, in per-page mode, the
            # same output file name.
            index=page_data.get("index", position),
            markdown=page_data.get("markdown") or "",
            image=page_data.get("image", "")
        )

        for img_data in page_data.get("images") or []:
            page.images.append(OCRResponseImage(
                id=img_data.get("id", ""),
                image_base64=img_data.get("image_base64", "")
            ))

        ocr_response.pages.append(page)

    metadata = data.get("metadata")
    if metadata is not None:
        ocr_response.metadata = OCRResponseMetadata(
            title=metadata.get("title", ""),
            author=metadata.get("author", ""),
            creation_date=metadata.get("creation_date", ""),
            page_count=metadata.get("page_count", 0)
        )

    return ocr_response


def _build_single_document(ocr_response, json_file, args):
    """
    Render all pages of an OCR response as one Markdown document.

    Args:
        ocr_response (OCRResponse): Parsed OCR response
        json_file (str): Path of the source JSON file (used as title fallback)
        args: Command line arguments; reads ``title_from_filename``,
            ``images`` and ``page_breaks``

    Returns:
        str: The combined Markdown text
    """
    metadata = ocr_response.metadata

    # Title precedence: metadata title, then file name (if requested),
    # then a generic default.
    if metadata.title:
        title = metadata.title
    elif args.title_from_filename:
        title = Path(json_file).stem
    else:
        title = "Document"

    combined = [f"# {title}\n"]

    # Emit the metadata section only when at least one field has a value
    if metadata.author or metadata.creation_date or metadata.page_count:
        combined.append("## Document Metadata\n")
        if metadata.author:
            combined.append(f"**Author:** {metadata.author}\n")
        if metadata.creation_date:
            combined.append(f"**Creation Date:** {metadata.creation_date}\n")
        if metadata.page_count:
            combined.append(f"**Page Count:** {metadata.page_count}\n")
        combined.append("\n")

    last_position = len(ocr_response.pages) - 1
    for position, page in enumerate(ocr_response.pages):
        # Page headers are one-based
        combined.append(f"## Page {page.index + 1}\n")

        # replace_image_references returns the content untouched when
        # args.images is false, so no extra guard is needed here.
        combined.append(replace_image_references(page.markdown, page.images, args.images))
        combined.append("\n")

        # Separator between pages, never after the last one
        if args.page_breaks and position < last_position:
            combined.append("\n---\n")

    return "\n".join(combined)


def convert_json_to_markdown(json_file, args):
    """
    Convert OCR JSON results to Markdown format.

    In single-file mode (``args.single_file``) all pages are written to one
    document inside ``args.output_dir``, named ``args.output_file`` or
    ``document.md``. Otherwise each page is written to ``<index>.md`` in
    ``args.output_dir``. Pages without an ``index`` in the JSON use their
    position in the ``pages`` list.

    Args:
        json_file (str): Path to the JSON file containing OCR results
        args: Command line arguments containing conversion options; reads
            ``output_dir``, ``single_file``, ``output_file``,
            ``title_from_filename``, ``images`` and ``page_breaks``

    Raises:
        SystemExit: If an error occurs during conversion (the error is
            printed to stderr and the exit status is 1)
    """
    try:
        # Read JSON file
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ocr_response = _parse_ocr_response(data)

        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)

        if args.single_file:
            document = _build_single_document(ocr_response, json_file, args)

            # Use custom filename if provided, otherwise use default
            output_path = Path(args.output_dir) / (args.output_file or "document.md")
            # output_file may contain directory components; make sure they exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(document)

            print(f"Created single markdown file: {output_path}")
        else:
            # Process each page into a separate file named after its index
            for page in ocr_response.pages:
                output_path = Path(args.output_dir) / f"{page.index}.md"

                markdown_content = replace_image_references(page.markdown, page.images, args.images)

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)

                print(f"Created markdown file: {output_path}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(ocr_response.pages)}")

    except Exception as e:
        # Command boundary: report any failure and exit with a non-zero status
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)