"""Convert OCR JSON results into Markdown files."""

import base64
import json
import os
import re
import sys
import uuid
from pathlib import Path


def run(args):
    """
    Main entry point for the convert command.

    Converts OCR JSON results to Markdown format.

    Args:
        args: Command line arguments parsed by argparse
    """
    # Naming an explicit output file only makes sense for a single document,
    # so it implicitly switches on single-file mode.
    if args.output_file:
        args.single_file = True

    convert_json_to_markdown(args.json_file, args)


# OCR response structure classes
class OCRResponseImage:
    """
    Represents an image in the OCR response.

    Attributes:
        id (str): Unique identifier for the image
        image_base64 (str): Base64-encoded image data
    """

    def __init__(self, id, image_base64):
        self.id = id
        self.image_base64 = image_base64


class OCRResponsePage:
    """
    Represents a page in the OCR response.

    Attributes:
        index (int): Zero-based page index
        markdown (str): Extracted text content in Markdown format
        image (str, optional): Main page image (if available)
        images (list): List of OCRResponseImage objects
        dimensions (dict, optional): Page dimensions
    """

    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        self.index = index
        self.markdown = markdown
        self.image = image
        # A falsy ``images`` argument yields a fresh list per instance.
        self.images = images or []
        self.dimensions = dimensions


class OCRResponseMetadata:
    """
    Represents metadata in the OCR response.

    Attributes:
        title (str, optional): Document title
        author (str, optional): Document author
        creation_date (str, optional): Document creation date
        page_count (int, optional): Total number of pages
    """

    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        self.title = title
        self.author = author
        self.creation_date = creation_date
        self.page_count = page_count


class OCRResponse:
    """
    Represents the complete OCR response.

    Attributes:
        pages (list): List of OCRResponsePage objects
        metadata (OCRResponseMetadata): Document metadata
    """

    def __init__(self, pages=None, metadata=None):
        self.pages = pages or []
        self.metadata = metadata or OCRResponseMetadata()


def extract_image_to_file(image_base64, image_id, image_dir):
    """
    Extract a base64-encoded image to a file.

    The image type is taken from a ``data:image/<type>`` prefix when present
    and defaults to JPEG otherwise. Failures to decode or write are reported
    on stderr and signalled by a ``None`` return instead of an exception.

    Args:
        image_base64 (str): Base64-encoded image data, optionally as a data URL
        image_id (str): Unique identifier for the image
        image_dir (str): Directory to save the image

    Returns:
        str: ``<image_dir basename>/<filename>`` using forward slashes so it
        can be used as a Markdown link, or None if the image was not saved
    """
    # Create the image directory if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    # Default to JPEG unless a data URL prefix says otherwise
    image_type = "jpeg"
    image_data = image_base64

    if image_base64.startswith("data:"):
        # Split on the first comma rather than regex-matching the payload:
        # a ``.+`` payload match stops at the first newline and would
        # silently truncate line-wrapped base64 data.
        header, separator, image_data = image_base64.partition(",")
        if not separator:
            print(
                f"Warning: Failed to save image {image_id}: malformed data URL",
                file=sys.stderr,
            )
            return None
        match = re.match(r"data:image/(\w+)", header)
        if match:
            image_type = match.group(1)

    # Generate a filename based on the image ID
    # Use a sanitized version of the image ID or a UUID if needed
    safe_id = re.sub(r'[^\w\-_]', '_', image_id) or str(uuid.uuid4())
    filename = f"{safe_id}.{image_type}"
    file_path = os.path.join(image_dir, filename)

    try:
        # Decode before opening the file so invalid base64 does not leave an
        # empty file behind.
        decoded = base64.b64decode(image_data)
        with open(file_path, 'wb') as f:
            f.write(decoded)
    except (OSError, ValueError) as e:
        # binascii.Error (bad base64) is a ValueError subclass.
        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
        return None

    # normpath drops a trailing separator that would otherwise make the
    # basename empty; forward slashes keep the link valid on every platform.
    dir_name = os.path.basename(os.path.normpath(image_dir))
    return os.path.join(dir_name, filename).replace(os.sep, "/")


def replace_image_references(content, images, include_images, extract_images=False, image_dir=None):
    """
    Replace image references in markdown content with either base64 data or file references.

    Only references of the exact form ``![<id>](<id>)`` are rewritten.

    Args:
        content (str): Markdown content with image references
        images (list): List of OCRResponseImage objects
        include_images (bool): Whether to include images in the output
        extract_images (bool, optional): Whether to extract images to files. Defaults to False.
        image_dir (str, optional): Directory to save extracted images. Defaults to None.

    Returns:
        str: Markdown content with image references replaced
    """
    if not include_images or not images:
        return content

    # Create a map of image IDs to their link target (data URL or file path)
    image_map = {}
    for img in images:
        if not img.image_base64:
            continue

        data_url = img.image_base64
        if not data_url.startswith("data:"):
            data_url = "data:image/jpeg;base64," + data_url

        if extract_images and image_dir:
            # Extract the image to a file and link to that file; images that
            # fail to extract keep their original reference.
            file_path = extract_image_to_file(data_url, img.id, image_dir)
            if file_path:
                image_map[img.id] = file_path
        else:
            # Embed the base64 data directly
            image_map[img.id] = data_url

    # Literal replacement: re.sub would interpret backslashes in the
    # replacement (e.g. in an id or a Windows path) as template escapes.
    for image_id, image_ref in image_map.items():
        content = content.replace(
            f"![{image_id}]({image_id})", f"![{image_id}]({image_ref})"
        )

    return content


def _parse_ocr_response(data):
    """Build an OCRResponse from the decoded JSON document."""
    ocr_response = OCRResponse()

    if "pages" in data:
        for position, page_data in enumerate(data["pages"]):
            page = OCRResponsePage(
                # Fall back to the page's position so pages without an
                # explicit index do not all collapse onto index 0.
                index=page_data.get("index", position),
                markdown=page_data.get("markdown", ""),
                image=page_data.get("image", ""),
                dimensions=page_data.get("dimensions"),
            )
            if "images" in page_data:
                for img_data in page_data["images"]:
                    page.images.append(OCRResponseImage(
                        id=img_data.get("id", ""),
                        image_base64=img_data.get("image_base64", ""),
                    ))
            ocr_response.pages.append(page)

    if "metadata" in data:
        metadata = data["metadata"]
        ocr_response.metadata = OCRResponseMetadata(
            title=metadata.get("title", ""),
            author=metadata.get("author", ""),
            creation_date=metadata.get("creation_date", ""),
            page_count=metadata.get("page_count", 0),
        )

    return ocr_response


def _render_page(page, args, image_dir):
    """Return a page's markdown, with image references replaced if enabled."""
    if not args.images:
        return page.markdown
    return replace_image_references(
        page.markdown, page.images, args.images, args.extract_images, image_dir
    )


def _build_single_document(ocr_response, json_file, args, image_dir):
    """Return the list of markdown fragments making up the combined document."""
    metadata = ocr_response.metadata

    # Prefer the metadata title, then (optionally) the input file name.
    title = "Document"
    if metadata.title:
        title = metadata.title
    elif args.title_from_filename:
        title = Path(json_file).stem

    combined = [f"# {title}\n"]

    # Add metadata if available
    if metadata.author or metadata.creation_date or metadata.page_count:
        combined.append("## Document Metadata\n")
        if metadata.author:
            combined.append(f"**Author:** {metadata.author}\n")
        if metadata.creation_date:
            combined.append(f"**Creation Date:** {metadata.creation_date}\n")
        if metadata.page_count:
            combined.append(f"**Page Count:** {metadata.page_count}\n")
        combined.append("\n")

    last = len(ocr_response.pages) - 1
    for i, page in enumerate(ocr_response.pages):
        combined.append(f"## Page {page.index + 1}\n")
        combined.append(_render_page(page, args, image_dir))
        combined.append("\n")
        # Add page separator if not the last page
        if args.page_breaks and i < last:
            combined.append("\n---\n")

    return combined


def convert_json_to_markdown(json_file, args):
    """
    Convert OCR JSON results to Markdown format.

    Writes either one combined file (``args.single_file``) or one
    ``<page index>.md`` file per page into ``args.output_dir``.

    NOTE: extracted images are linked as ``<image dir name>/<file>``; that
    link only resolves when the image directory sits next to the markdown
    file (the default layout).

    Args:
        json_file (str): Path to the JSON file containing OCR results
        args: Command line arguments containing conversion options

    Raises:
        SystemExit: If an error occurs during conversion
    """
    # Determine image directory if extracting images
    image_dir = None
    if args.extract_images and args.images:
        # Default to output_dir/images
        image_dir = args.image_dir or os.path.join(args.output_dir, "images")

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ocr_response = _parse_ocr_response(data)

        # Create output directory if it doesn't exist
        os.makedirs(args.output_dir, exist_ok=True)

        if args.single_file:
            combined = _build_single_document(ocr_response, json_file, args, image_dir)

            # Use custom filename if provided, otherwise use default
            output_path = Path(args.output_dir) / (args.output_file or "document.md")
            # output_file may contain directory components; ensure they exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n".join(combined))

            print(f"Created single markdown file: {output_path}")
        else:
            # Process each page into a separate file named after its index
            for page in ocr_response.pages:
                output_path = Path(args.output_dir) / f"{page.index}.md"

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(_render_page(page, args, image_dir))

                print(f"Created markdown file: {output_path}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(ocr_response.pages)}")

    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)