# mistral-ocr/mistral_ocr/commands/convert.py (Python, 336 lines, 12 KiB)

import json
import os
import sys
import re
import base64
import uuid
from pathlib import Path
def run(args):
    """
    Entry point of the convert command.

    Hands the OCR JSON file named on the command line over to
    ``convert_json_to_markdown`` so it is rendered as Markdown.

    Args:
        args: Namespace produced by argparse. ``output_file`` and
            ``json_file`` are read; ``single_file`` may be set.
    """
    # A named output file implies one combined document, so force
    # single-file mode whenever one was requested.
    wants_named_output = bool(args.output_file)
    if wants_named_output:
        args.single_file = True

    source_json = args.json_file
    convert_json_to_markdown(source_json, args)
# OCR response structure classes
class OCRResponseImage:
    """
    One image embedded in a page of the OCR response.

    Attributes:
        id (str): Identifier of the image
        image_base64 (str): Image payload as base64 text
    """

    def __init__(self, id, image_base64):
        # Plain value holder: both arguments are stored unchanged.
        self.id, self.image_base64 = id, image_base64
class OCRResponsePage:
    """
    One page of the OCR response.

    Attributes:
        index (int): Page index as supplied by the caller
        markdown (str): Page text in Markdown format
        image (str, optional): Main page image, when one is supplied
        images (list): OCRResponseImage objects belonging to the page
        dimensions (dict, optional): Page dimensions, when supplied
    """

    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        self.index = index
        self.markdown = markdown
        self.image = image
        # A missing (or empty) image list becomes a fresh list so that
        # instances never share one mutable default.
        self.images = images if images else []
        self.dimensions = dimensions
class OCRResponseMetadata:
    """
    Document-level metadata carried by the OCR response.

    Every field is optional and defaults to ``None``.

    Attributes:
        title (str, optional): Title of the document
        author (str, optional): Author of the document
        creation_date (str, optional): Creation date of the document
        page_count (int, optional): Number of pages in the document
    """

    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        # Descriptive fields first, then the numeric one.
        self.title, self.author = title, author
        self.creation_date, self.page_count = creation_date, page_count
class OCRResponse:
    """
    The whole OCR response: its pages plus document metadata.

    Attributes:
        pages (list): OCRResponsePage objects
        metadata (OCRResponseMetadata): Metadata of the document
    """

    def __init__(self, pages=None, metadata=None):
        # Falsy arguments are replaced by fresh defaults: an empty page
        # list and an all-empty metadata object.
        self.pages = pages if pages else []
        self.metadata = metadata if metadata else OCRResponseMetadata()
def extract_image_to_file(image_base64, image_id, image_dir):
    """
    Decode a base64-encoded image and write it to a file in ``image_dir``.

    Args:
        image_base64 (str): Base64 image data, either bare or wrapped in a
            ``data:image/<type>;base64,<payload>`` URL
        image_id (str): Identifier of the image; its sanitized form becomes
            the file name (a random UUID is used when the ID is empty)
        image_dir (str): Directory to save the image in; created if missing

    Returns:
        str or None: ``<last component of image_dir>/<filename>``, i.e. the
        path of the image relative to the parent of ``image_dir``, written
        with a forward slash so it can be used as a Markdown link target.
        ``None`` when the data is malformed or the file cannot be written
        (a warning is printed to stderr in that case).
    """
    # Create the image directory if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    # Default to JPEG unless a data URL header says otherwise
    image_type = "jpeg"
    image_data = image_base64
    if image_base64.startswith("data:"):
        # Split on the first comma instead of matching the payload with a
        # regex: "." stops at a newline, which would silently truncate
        # line-wrapped base64 data.
        header, separator, image_data = image_base64.partition(",")
        if not separator:
            print(
                f"Warning: Failed to save image {image_id}: malformed data URL (no payload)",
                file=sys.stderr,
            )
            return None
        type_match = re.match(r"data:image/(\w+)", header)
        if type_match:
            image_type = type_match.group(1)

    # Generate a filename from the sanitized image ID, or a UUID if the ID is empty
    safe_id = re.sub(r'[^\w\-_]', '_', image_id) or str(uuid.uuid4())
    filename = f"{safe_id}.{image_type}"
    file_path = os.path.join(image_dir, filename)

    try:
        # Decode before opening the file so that invalid base64 does not
        # leave an empty file behind.
        image_bytes = base64.b64decode(image_data)
        with open(file_path, 'wb') as f:
            f.write(image_bytes)
    except (OSError, ValueError) as e:
        # binascii.Error (invalid base64) is a ValueError subclass
        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
        return None

    # normpath drops a trailing separator; without it basename() would be
    # empty for "out/images/" and the link would lose its directory part.
    dir_name = os.path.basename(os.path.normpath(image_dir))
    return f"{dir_name}/{filename}" if dir_name else filename
def replace_image_references(content, images, include_images, extract_images=False, image_dir=None):
    """
    Replace image references in markdown content with either base64 data or file references.

    Every ``![<id>](<id>)`` occurrence whose ID belongs to one of ``images``
    is rewritten to ``![<id>](<target>)``, where the target is a data URL or
    the path of the extracted image file.

    Args:
        content (str): Markdown content with image references
        images (list): Objects exposing ``id`` and ``image_base64``
            (OCRResponseImage objects)
        include_images (bool): Whether to include images in the output
        extract_images (bool, optional): Whether to extract images to files. Defaults to False.
        image_dir (str, optional): Directory to save extracted images. Defaults to None.

    Returns:
        str: Markdown content with image references replaced. The content is
        returned unchanged when images are disabled or none are given;
        images without data, or whose extraction fails, keep their original
        reference.
    """
    if not include_images or not images:
        return content

    # Map each image ID to its link target (data URL or extracted file path)
    image_map = {}
    for img in images:
        if not img.image_base64:
            continue
        # Bare base64 is treated as JPEG and wrapped in a data URL
        data_url = img.image_base64
        if not data_url.startswith("data:"):
            data_url = "data:image/jpeg;base64," + data_url
        if extract_images and image_dir:
            file_path = extract_image_to_file(data_url, img.id, image_dir)
            if file_path:
                image_map[img.id] = file_path
        else:
            image_map[img.id] = data_url

    # Literal replacement on purpose: re.sub would interpret backslashes in
    # the replacement text (e.g. in Windows paths or unusual IDs) as escapes
    # or group references.
    for image_id, image_ref in image_map.items():
        content = content.replace(
            f"![{image_id}]({image_id})",
            f"![{image_id}]({image_ref})",
        )
    return content
def _parse_ocr_response(data):
    """Build an OCRResponse from the decoded OCR JSON document."""
    ocr_response = OCRResponse()

    if "pages" in data:
        for position, page_data in enumerate(data["pages"]):
            page = OCRResponsePage(
                # Fall back to the page's position: defaulting every missing
                # index to 0 would make all pages overwrite "0.md".
                index=page_data.get("index", position),
                # A JSON null is treated as empty text so writing never fails on None
                markdown=page_data.get("markdown") or "",
                image=page_data.get("image", ""),
            )
            for img_data in page_data.get("images", []):
                page.images.append(OCRResponseImage(
                    id=img_data.get("id", ""),
                    image_base64=img_data.get("image_base64", ""),
                ))
            ocr_response.pages.append(page)

    if "metadata" in data:
        metadata = data["metadata"]
        ocr_response.metadata = OCRResponseMetadata(
            title=metadata.get("title", ""),
            author=metadata.get("author", ""),
            creation_date=metadata.get("creation_date", ""),
            page_count=metadata.get("page_count", 0),
        )

    return ocr_response


def _render_page_markdown(page, args, image_dir):
    """Return a page's Markdown, with image references resolved when images are enabled."""
    if not args.images:
        return page.markdown
    return replace_image_references(
        page.markdown,
        page.images,
        args.images,
        args.extract_images,
        image_dir,
    )


def _build_combined_markdown(ocr_response, json_file, args, image_dir):
    """Return the text of the single-file document: title, metadata and all pages."""
    metadata = ocr_response.metadata
    combined = []

    # Title priority: metadata title, then the JSON file name (if requested), then a fixed default
    title = "Document"
    if metadata.title:
        title = metadata.title
    elif args.title_from_filename:
        title = Path(json_file).stem
    combined.append(f"# {title}\n")

    # Metadata section, only when at least one field has a value
    if metadata.author or metadata.creation_date or metadata.page_count:
        combined.append("## Document Metadata\n")
        if metadata.author:
            combined.append(f"**Author:** {metadata.author}\n")
        if metadata.creation_date:
            combined.append(f"**Creation Date:** {metadata.creation_date}\n")
        if metadata.page_count:
            combined.append(f"**Page Count:** {metadata.page_count}\n")
        combined.append("\n")

    last_position = len(ocr_response.pages) - 1
    for position, page in enumerate(ocr_response.pages):
        combined.append(f"## Page {page.index + 1}\n")
        combined.append(_render_page_markdown(page, args, image_dir))
        combined.append("\n")
        # Separator between pages, never after the last one
        if args.page_breaks and position < last_position:
            combined.append("\n---\n")

    return "\n".join(combined)


def convert_json_to_markdown(json_file, args):
    """
    Convert OCR JSON results to Markdown format.

    In single-file mode all pages are written to one document inside
    ``args.output_dir`` (``args.output_file`` or ``document.md``); otherwise
    each page is written to ``<page index>.md``. Progress messages are
    printed to stdout.

    Args:
        json_file (str): Path to the JSON file containing OCR results
        args: Command line arguments containing conversion options. Reads
            ``output_dir``, ``output_file``, ``single_file``, ``images``,
            ``extract_images``, ``image_dir``, ``title_from_filename`` and
            ``page_breaks``.

    Raises:
        SystemExit: If an error occurs during conversion (exit status 1,
            after the error was printed to stderr)
    """
    # Determine image directory if extracting images (default: output_dir/images)
    image_dir = None
    if args.extract_images and args.images:
        image_dir = args.image_dir or os.path.join(args.output_dir, "images")

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ocr_response = _parse_ocr_response(data)

        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)

        if args.single_file:
            output_path = Path(args.output_dir) / (args.output_file or "document.md")
            # output_file may contain directory components; make sure they exist
            output_path.parent.mkdir(parents=True, exist_ok=True)
            document = _build_combined_markdown(ocr_response, json_file, args, image_dir)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(document)
            print(f"Created single markdown file: {output_path}")
        else:
            # One file per page, named after the page index
            for page in ocr_response.pages:
                output_path = Path(args.output_dir) / f"{page.index}.md"
                markdown_content = _render_page_markdown(page, args, image_dir)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)
                print(f"Created markdown file: {output_path}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(ocr_response.pages)}")
    except Exception as e:
        # Top-level boundary of the CLI command: report any failure and exit non-zero
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)