From 220864d52f54520494052f77dd4989b19445c6d7 Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Thu, 24 Apr 2025 21:44:49 +0200 Subject: [PATCH] Add feature to extract images as separate files --- API.md | 67 ++++++++++++++++++- CHANGELOG.md | 1 + README.md | 30 +++++++++ mistral_ocr/__main__.py | 4 ++ mistral_ocr/commands/convert.py | 108 +++++++++++++++++++++++++++---- mistral_ocr/commands/markdown.py | 4 +- 6 files changed, 197 insertions(+), 17 deletions(-) diff --git a/API.md b/API.md index 6d06b2b..bb4266a 100644 --- a/API.md +++ b/API.md @@ -13,6 +13,8 @@ This document provides detailed information about the Mistral OCR API response f - [Working with the API Response](#working-with-the-api-response) - [Parsing the JSON Response](#parsing-the-json-response) - [Handling Images](#handling-images) + - [1. Embedded Images](#1-embedded-images) + - [2. Extracted Images](#2-extracted-images) - [Working with Markdown Content](#working-with-markdown-content) - [Error Handling](#error-handling) - [API Key Errors](#api-key-errors) @@ -113,7 +115,11 @@ for page in ocr_data.get('pages', []): ### Handling Images -If you've included images in the response (using the `--include-images` flag), you can extract and save them: +The Mistral OCR CLI provides two approaches for handling images: + +#### 1. Embedded Images + +When using the `--images` flag without `--extract-images`, images are embedded directly in the markdown as base64 data. If you've included images in the response (using the `--include-images` flag), you can extract and save them manually: ```python import base64 @@ -141,6 +147,65 @@ for page in ocr_data.get('pages', []): img_file.write(img_bytes) ``` +#### 2. Extracted Images + +Alternatively, you can use the `--extract-images` flag with the CLI to automatically extract images to separate files. This approach: + +- Saves each image as a separate file in the specified directory (or `output_dir/images` by default) +- Updates the markdown to reference these image files instead of embedding base64 data +- Results in smaller, more manageable markdown files + +Example command: +```bash +mistral-ocr markdown document.pdf --images --extract-images --image-dir custom_images +``` + +If you're working with the API directly and want to implement similar functionality, here's how you might do it: + +```python +import base64 +import os +import re + +def extract_images_from_ocr_data(ocr_data, image_dir='images'): + """Extract images from OCR data and update markdown references.""" + # Create image directory + os.makedirs(image_dir, exist_ok=True) + + # Process each page + for page in ocr_data.get('pages', []): + page_index = page.get('index', 0) + markdown = page.get('markdown', '') + + # Extract and save images + for img_index, image in enumerate(page.get('images', [])): + img_id = image.get('id', f'unknown-{img_index}') + img_data = image.get('image_base64', '') + + if img_data: + # Generate filename + filename = f"{img_id.replace(' ', '_')}.jpg" + filepath = os.path.join(image_dir, filename) + + # Remove data URL prefix if present + if ',' in img_data: + img_data = img_data.split(',', 1)[1] + + # Save the image + with open(filepath, 'wb') as img_file: + img_file.write(base64.b64decode(img_data)) + + # Update markdown to reference the file + pattern = f"!\\[{re.escape(img_id)}\\]\\({re.escape(img_id)}\\)" + replacement = f"![{img_id}]({os.path.join(os.path.basename(image_dir), filename)})" + markdown = re.sub(pattern, replacement, markdown) + + # Update the page's markdown + page['markdown'] = markdown + + return ocr_data +``` + ### Working with Markdown Content The OCR results are provided in Markdown format, which makes it easy to convert to other formats or display in applications: diff --git a/CHANGELOG.md b/CHANGELOG.md index ce95f6b..56a9985 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Enhanced documentation including README.md, CONTRIBUTING.md, and CHANGELOG.md - More detailed troubleshooting section - API response format documentation +- Option to extract images to separate files instead of embedding them in markdown ## [0.1.0] - 2025-04-24 diff --git a/README.md b/README.md index b19c21c..ddd6d98 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,12 @@ mistral-ocr convert results.json --output-file document.md # Include images in markdown (if available in JSON) mistral-ocr convert results.json --images + +# Extract images to files instead of embedding them in markdown +mistral-ocr convert results.json --images --extract-images + +# Specify a custom directory for extracted images +mistral-ocr convert results.json --images --extract-images --image-dir images_folder ``` #### Process and Convert in One Step @@ -134,6 +140,12 @@ mistral-ocr markdown path/to/document.pdf --output-file docs/result.md # Save intermediate JSON and generate markdown files mistral-ocr markdown path/to/document.pdf --json-file results.json --output-dir docs + +# Extract images to files instead of embedding them in markdown +mistral-ocr markdown path/to/document.pdf --images --extract-images + +# Specify a custom directory for extracted images +mistral-ocr markdown path/to/document.pdf --images --extract-images --image-dir custom_images ``` This command combines the `process` and `convert` steps, creating markdown files directly from the document. @@ -182,8 +194,26 @@ mistral-ocr markdown ~/Documents/research-paper.pdf --single-file --output-dir r # Generate a single markdown file with specific filename mistral-ocr markdown ~/Documents/research-paper.pdf --output-file research_docs/paper.md + +# Process a document and extract images to separate files +mistral-ocr markdown ~/Documents/research-paper.pdf --images --extract-images --output-dir research_docs ``` +## Image Handling + +The tool provides several options for handling images in the OCR output: + +1. **No images**: By default, images are not included in the output. + +2. **Embedded images**: Using the `--images` flag without `--extract-images` will embed base64-encoded images directly in the markdown file. This creates a self-contained document but can result in very large files. + +3. **Extracted images**: Using both `--images` and `--extract-images` flags will: + - Extract images from the OCR results + - Save them as separate files in an images directory + - Reference these files in the markdown instead of embedding the base64 data + +You can specify a custom directory for extracted images using the `--image-dir` option. If not specified, images will be saved in a subdirectory called "images" within the output directory. + ## OCR Response Format The OCR API returns a JSON response with the following structure: diff --git a/mistral_ocr/__main__.py b/mistral_ocr/__main__.py index fac57a1..e3c5b93 100644 --- a/mistral_ocr/__main__.py +++ b/mistral_ocr/__main__.py @@ -32,6 +32,8 @@ def main(): convert_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files") convert_parser.add_argument("-o", "--output-file", help="Output filename for single file mode (default: document.md)") convert_parser.add_argument("--images", action="store_true", help="Include images in markdown (if available)") + convert_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them") + convert_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)") convert_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages") convert_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title") convert_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page") @@ -43,6 +45,8 @@ def main(): markdown_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files") markdown_parser.add_argument("-o", "--output-file", help="Path for output markdown file (implies --single-file)") markdown_parser.add_argument("--images", action="store_true", help="Include extracted images in markdown (if available)") + markdown_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them") + markdown_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)") markdown_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages") markdown_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title") markdown_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page") diff --git a/mistral_ocr/commands/convert.py b/mistral_ocr/commands/convert.py index 6c4c13f..98013f3 100644 --- a/mistral_ocr/commands/convert.py +++ b/mistral_ocr/commands/convert.py @@ -2,6 +2,8 @@ import json import os import sys import re +import base64 +import uuid from pathlib import Path def run(args): @@ -78,36 +80,94 @@ class OCRResponse: self.pages = pages or [] self.metadata = metadata or OCRResponseMetadata() -def replace_image_references(content, images, include_images): +def extract_image_to_file(image_base64, image_id, image_dir): """ - Replace image references in markdown content with base64 data. + Extract a base64-encoded image to a file. + + Args: + image_base64 (str): Base64-encoded image data + image_id (str): Unique identifier for the image + image_dir (str): Directory to save the image + + Returns: + str: Path to the saved image file (relative to the markdown file) + """ + # Create the image directory if it doesn't exist + os.makedirs(image_dir, exist_ok=True) + + # Clean up the base64 data if it has a data URL prefix + if image_base64.startswith("data:"): + # Extract the image type and base64 data + match = re.match(r"data:image/(\w+);base64,(.+)", image_base64) + if match: + image_type, image_data = match.groups() + else: + # Default to JPEG if format can't be determined + image_type = "jpeg" + image_data = image_base64.split(",", 1)[1] + else: + # Default to JPEG if no data URL prefix + image_type = "jpeg" + image_data = image_base64 + + # Generate a filename based on the image ID + # Use a sanitized version of the image ID or a UUID if needed + safe_id = re.sub(r'[^\w\-_]', '_', image_id) or str(uuid.uuid4()) + filename = f"{safe_id}.{image_type}" + file_path = os.path.join(image_dir, filename) + + # Decode and save the image + try: + with open(file_path, 'wb') as f: + f.write(base64.b64decode(image_data)) + return os.path.join(os.path.basename(image_dir), filename) + except Exception as e: + print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr) + return None + +def replace_image_references(content, images, include_images, extract_images=False, image_dir=None): + """ + Replace image references in markdown content with either base64 data or file references. Args: content (str): Markdown content with image references images (list): List of OCRResponseImage objects include_images (bool): Whether to include images in the output + extract_images (bool, optional): Whether to extract images to files. Defaults to False. + image_dir (str, optional): Directory to save extracted images. Defaults to None. Returns: - str: Markdown content with image references replaced with base64 data + str: Markdown content with image references replaced """ if not include_images or not images: return content - # Create a map of image IDs to their base64 data + # Create a map of image IDs to their image data (either base64 or file path) image_map = {} for img in images: if img.image_base64: - img_data = img.image_base64 - if not img_data.startswith("data:"): - img_data = "data:image/jpeg;base64," + img_data - image_map[img.id] = img_data + if extract_images and image_dir: + # Extract the image to a file and use the file path + img_data = img.image_base64 + if not img_data.startswith("data:"): + img_data = "data:image/jpeg;base64," + img_data + + file_path = extract_image_to_file(img_data, img.id, image_dir) + if file_path: + image_map[img.id] = file_path + else: + # Use base64 data directly + img_data = img.image_base64 + if not img_data.startswith("data:"): + img_data = "data:image/jpeg;base64," + img_data + image_map[img.id] = img_data - # Replace all image references with base64 data - for id, base64_data in image_map.items(): + # Replace all image references + for id, image_ref in image_map.items(): # Escape special characters in the ID for regex escaped_id = re.escape(id) pattern = f"!\\[{escaped_id}\\]\\({escaped_id}\\)" - replacement = f"![{id}]({base64_data})" + replacement = f"![{id}]({image_ref})" content = re.sub(pattern, replacement, content) @@ -124,6 +184,14 @@ def convert_json_to_markdown(json_file, args): Raises: SystemExit: If an error occurs during conversion """ + # Determine image directory if extracting images + image_dir = None + if args.extract_images and args.images: + if args.image_dir: + image_dir = args.image_dir + else: + # Default to output_dir/images + image_dir = os.path.join(args.output_dir, "images") try: # Read JSON file with open(json_file, 'r', encoding='utf-8') as f: @@ -199,10 +267,16 @@ def convert_json_to_markdown(json_file, args): # Convert page images to OCRResponseImage format page_images = [OCRResponseImage(img.id, img.image_base64) for img in page.images] - # Replace image references in markdown content if includeImages is true + # Replace image references in markdown content if images is true page_content = page.markdown if args.images: - page_content = replace_image_references(page_content, page_images, args.images) + page_content = replace_image_references( + page_content, + page_images, + args.images, + args.extract_images, + image_dir + ) # Add page content combined.append(page_content) @@ -240,7 +314,13 @@ def convert_json_to_markdown(json_file, args): # Get page content with image references replaced if needed markdown_content = page.markdown if args.images: - markdown_content = replace_image_references(markdown_content, page_images, args.images) + markdown_content = replace_image_references( + markdown_content, + page_images, + args.images, + args.extract_images, + image_dir + ) with open(output_path, 'w', encoding='utf-8') as f: f.write(markdown_content) diff --git a/mistral_ocr/commands/markdown.py b/mistral_ocr/commands/markdown.py index c67bc71..09bdf9b 100644 --- a/mistral_ocr/commands/markdown.py +++ b/mistral_ocr/commands/markdown.py @@ -18,8 +18,8 @@ def run(args): Raises: SystemExit: If an error occurs during processing or conversion """ - # Ensure that if --images is true, include_image_base64 is also true - include_image_base64 = args.images + # Ensure that if --images or --extract-images is true, include_image_base64 is also true + include_image_base64 = args.images or args.extract_images # If output file is specified, enable single file mode if args.output_file: