5e891ef461
This commit adds extensive documentation to the Mistral OCR CLI project: - Add API.md with detailed API response format documentation - Add CHANGELOG.md to track version changes - Add CONTRIBUTING.md with guidelines for contributors - Enhance README.md with more detailed usage examples and troubleshooting - Add proper docstrings to all Python modules and functions - Update requirements.txt with development dependencies - Improve setup.py with better metadata These changes make the project more accessible to users and contributors.
256 lines
9.4 KiB
Python
256 lines
9.4 KiB
Python
import json
|
|
import os
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
|
|
def run(args):
    """
    Entry point for the convert command.

    Turns an OCR JSON result file into Markdown output.

    Args:
        args: Command line arguments parsed by argparse. Reads
            ``output_file`` and ``json_file``; may set ``single_file``.
    """
    # Naming an explicit output file only makes sense for combined output,
    # so it switches single file mode on.
    wants_named_output = bool(args.output_file)
    if wants_named_output:
        args.single_file = True

    convert_json_to_markdown(args.json_file, args)
|
|
|
|
# OCR response structure classes
|
|
class OCRResponseImage:
    """
    One image entry of an OCR response page.

    Attributes:
        id (str): Unique identifier for the image
        image_base64 (str): Base64-encoded image data
    """

    def __init__(self, id, image_base64):
        # The parameter is named `id` (shadowing the builtin) because it is
        # part of the constructor's keyword interface.
        self.id, self.image_base64 = id, image_base64
|
|
|
|
class OCRResponsePage:
    """
    One page of an OCR response.

    Attributes:
        index (int): Zero-based page index
        markdown (str): Extracted text content in Markdown format
        image (str, optional): Main page image (if available)
        images (list): List of OCRResponseImage objects
        dimensions (dict, optional): Page dimensions
    """

    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        self.index, self.markdown = index, markdown
        self.image, self.dimensions = image, dimensions
        # A falsy `images` argument (None or empty) gets a fresh list so
        # instances never share one default container.
        self.images = images if images else []
|
|
|
|
class OCRResponseMetadata:
    """
    Document-level metadata of an OCR response.

    Attributes:
        title (str, optional): Document title
        author (str, optional): Document author
        creation_date (str, optional): Document creation date
        page_count (int, optional): Total number of pages
    """

    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        # Every field is optional and stored exactly as given.
        self.title, self.author = title, author
        self.creation_date, self.page_count = creation_date, page_count
|
|
|
|
class OCRResponse:
    """
    The complete OCR response: its pages plus document metadata.

    Attributes:
        pages (list): List of OCRResponsePage objects
        metadata (OCRResponseMetadata): Document metadata
    """

    def __init__(self, pages=None, metadata=None):
        # Falsy arguments are replaced with fresh defaults: an empty page
        # list and an empty metadata record.
        self.pages = pages if pages else []
        self.metadata = metadata if metadata else OCRResponseMetadata()
|
|
|
|
def replace_image_references(content, images, include_images):
    """
    Replace image references in markdown content with base64 data.

    Every reference of the form ``![<id>](<id>)`` whose ``<id>`` belongs to an
    image that carries base64 data is rewritten to ``![<id>](<data URI>)``.
    References to unknown images, or to images without data, are left as-is.

    Args:
        content (str): Markdown content with image references
        images (list): List of OCRResponseImage objects (anything exposing
            ``id`` and ``image_base64`` attributes works)
        include_images (bool): Whether to include images in the output

    Returns:
        str: Markdown content with image references replaced with base64 data
    """
    if not include_images or not images:
        return content

    # Map image IDs to their data URI. Images without data are skipped, and a
    # later image with the same ID overrides an earlier one.
    data_uri_by_id = {}
    for img in images:
        if not img.image_base64:
            continue
        data_uri = img.image_base64
        if not data_uri.startswith("data:"):
            # Bare base64 payloads get a JPEG data-URI prefix.
            data_uri = "data:image/jpeg;base64," + data_uri
        data_uri_by_id[img.id] = data_uri

    for image_id, data_uri in data_uri_by_id.items():
        # Literal replacement: IDs may contain regex metacharacters or
        # backslashes, which str.replace never interprets.
        content = content.replace(
            f"![{image_id}]({image_id})",
            f"![{image_id}]({data_uri})",
        )

    return content
|
|
|
|
def _parse_ocr_response(data):
    """Build an OCRResponse from the decoded JSON document."""
    if not isinstance(data, dict):
        raise ValueError("expected a JSON object at the top level of the OCR result")

    ocr_response = OCRResponse()

    # `or []` / `or {}` / `or ""` also cover keys that are present but null.
    for position, page_data in enumerate(data.get("pages") or []):
        index = page_data.get("index")
        if index is None:
            # Fall back to the position in the list so pages without an index
            # neither share a header nor overwrite each other's "<index>.md".
            index = position

        page = OCRResponsePage(
            index=index,
            markdown=page_data.get("markdown") or "",
            image=page_data.get("image", ""),
        )
        for img_data in page_data.get("images") or []:
            page.images.append(OCRResponseImage(
                id=img_data.get("id", ""),
                image_base64=img_data.get("image_base64", ""),
            ))
        ocr_response.pages.append(page)

    metadata = data.get("metadata") or {}
    if metadata:
        ocr_response.metadata = OCRResponseMetadata(
            title=metadata.get("title", ""),
            author=metadata.get("author", ""),
            creation_date=metadata.get("creation_date", ""),
            page_count=metadata.get("page_count", 0),
        )

    return ocr_response


def _build_single_document(ocr_response, json_file, args):
    """Return the combined markdown text used for single file output."""
    metadata = ocr_response.metadata

    # Title precedence: metadata title, then the input filename (if
    # requested), then a generic fallback.
    if metadata.title:
        title = metadata.title
    elif args.title_from_filename:
        title = Path(json_file).stem
    else:
        title = "Document"

    parts = [f"# {title}\n"]

    # The metadata section is emitted only when at least one field has a value.
    if metadata.author or metadata.creation_date or metadata.page_count:
        parts.append("## Document Metadata\n")
        if metadata.author:
            parts.append(f"**Author:** {metadata.author}\n")
        if metadata.creation_date:
            parts.append(f"**Creation Date:** {metadata.creation_date}\n")
        if metadata.page_count:
            parts.append(f"**Page Count:** {metadata.page_count}\n")
        parts.append("\n")

    last_position = len(ocr_response.pages) - 1
    for position, page in enumerate(ocr_response.pages):
        # Headers are one-based while page.index is zero-based.
        parts.append(f"## Page {page.index + 1}\n")
        parts.append(replace_image_references(page.markdown, page.images, args.images))
        parts.append("\n")

        # Separator between pages, never after the last one.
        if args.page_breaks and position < last_position:
            parts.append("\n---\n")

    return "\n".join(parts)


def convert_json_to_markdown(json_file, args):
    """
    Convert OCR JSON results to Markdown format.

    In single file mode all pages are combined into one document (named by
    ``args.output_file`` or ``document.md``); otherwise each page is written
    to ``<index>.md``. All files are created under ``args.output_dir``.

    Args:
        json_file (str): Path to the JSON file containing OCR results
        args: Command line arguments containing conversion options. Reads
            ``output_dir``, ``single_file``, ``output_file``,
            ``title_from_filename``, ``images`` and ``page_breaks``.

    Raises:
        SystemExit: If an error occurs during conversion
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ocr_response = _parse_ocr_response(data)

        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)

        if args.single_file:
            # Render before opening the target so a rendering failure does
            # not leave an empty file behind.
            document = _build_single_document(ocr_response, json_file, args)

            output_path = Path(args.output_dir) / (args.output_file or "document.md")
            # output_file may contain directory components; make sure they exist.
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(document)

            print(f"Created single markdown file: {output_path}")
        else:
            # One file per page, named after the page index.
            for page in ocr_response.pages:
                output_path = Path(args.output_dir) / f"{page.index}.md"
                markdown_content = replace_image_references(
                    page.markdown, page.images, args.images
                )

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)

                print(f"Created markdown file: {output_path}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(ocr_response.pages)}")

    except Exception as e:
        # Command-level boundary: report any failure and exit non-zero.
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)
|