From 220864d52f54520494052f77dd4989b19445c6d7 Mon Sep 17 00:00:00 2001
From: Heiko Joerg Schick <info@schihei.de>
Date: Thu, 24 Apr 2025 21:44:49 +0200
Subject: [PATCH] Add feature to extract images as separate files

---
 API.md                           |  67 ++++++++++++++++++-
 CHANGELOG.md                     |   1 +
 README.md                        |  30 +++++++++
 mistral_ocr/__main__.py          |   4 ++
 mistral_ocr/commands/convert.py  | 108 +++++++++++++++++++++++++++----
 mistral_ocr/commands/markdown.py |   4 +-
 6 files changed, 197 insertions(+), 17 deletions(-)

diff --git a/API.md b/API.md
index 6d06b2b..bb4266a 100644
--- a/API.md
+++ b/API.md
@@ -13,6 +13,8 @@ This document provides detailed information about the Mistral OCR API response f
   - [Working with the API Response](#working-with-the-api-response)
     - [Parsing the JSON Response](#parsing-the-json-response)
     - [Handling Images](#handling-images)
+      - [1. Embedded Images](#1-embedded-images)
+      - [2. Extracted Images](#2-extracted-images)
     - [Working with Markdown Content](#working-with-markdown-content)
   - [Error Handling](#error-handling)
     - [API Key Errors](#api-key-errors)
@@ -113,7 +115,11 @@ for page in ocr_data.get('pages', []):
 
 ### Handling Images
 
-If you've included images in the response (using the `--include-images` flag), you can extract and save them:
+The Mistral OCR CLI provides two approaches for handling images:
+
+#### 1. Embedded Images
+
+When using the `--images` flag without `--extract-images`, images are embedded directly in the markdown as base64 data. If the JSON response itself includes image data (requested with the `--include-images` flag), you can also extract and save the images manually:
 
 ```python
 import base64
@@ -141,6 +147,65 @@ for page in ocr_data.get('pages', []):
                 img_file.write(img_bytes)
 ```
 
+#### 2. Extracted Images
+
+Alternatively, you can use the `--extract-images` flag with the CLI to automatically extract images to separate files. This approach:
+
+- Saves each image as a separate file in the specified directory (or `output_dir/images` by default)
+- Updates the markdown to reference these image files instead of embedding base64 data
+- Results in smaller, more manageable markdown files
+
+Example command:
+```bash
+mistral-ocr markdown document.pdf --images --extract-images --image-dir custom_images
+```
+
+If you're working with the API directly and want to implement similar functionality, here's how you might do it:
+
+```python
+import base64
+import os
+import re
+
+def extract_images_from_ocr_data(ocr_data, image_dir='images'):
+    """Extract images from OCR data and update markdown references."""
+    # Create image directory
+    os.makedirs(image_dir, exist_ok=True)
+    
+    # Process each page
+    for page in ocr_data.get('pages', []):
+        page_index = page.get('index', 0)
+        markdown = page.get('markdown', '')
+        
+        # Extract and save images
+        for img_index, image in enumerate(page.get('images', [])):
+            img_id = image.get('id', f'unknown-{img_index}')
+            img_data = image.get('image_base64', '')
+            
+            if img_data:
+                # Generate filename
+                filename = f"{img_id.replace(' ', '_')}.jpg"
+                filepath = os.path.join(image_dir, filename)
+                
+                # Remove data URL prefix if present
+                if ',' in img_data:
+                    img_data = img_data.split(',', 1)[1]
+                
+                # Save the image
+                with open(filepath, 'wb') as img_file:
+                    img_file.write(base64.b64decode(img_data))
+                
+                # Update markdown to reference the file
+                pattern = f"!\\[{re.escape(img_id)}\\]\\({re.escape(img_id)}\\)"
+                replacement = f"![{img_id}]({os.path.join(os.path.basename(image_dir), filename)})"
+                markdown = re.sub(pattern, replacement, markdown)
+        
+        # Update the page's markdown
+        page['markdown'] = markdown
+    
+    return ocr_data
+```
+
 ### Working with Markdown Content
 
 The OCR results are provided in Markdown format, which makes it easy to convert to other formats or display in applications:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce95f6b..56a9985 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Enhanced documentation including README.md, CONTRIBUTING.md, and CHANGELOG.md
 - More detailed troubleshooting section
 - API response format documentation
+- Option to extract images to separate files instead of embedding them in markdown
 
 ## [0.1.0] - 2025-04-24
 
diff --git a/README.md b/README.md
index b19c21c..ddd6d98 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,12 @@ mistral-ocr convert results.json --output-file document.md
 
 # Include images in markdown (if available in JSON)
 mistral-ocr convert results.json --images
+
+# Extract images to files instead of embedding them in markdown
+mistral-ocr convert results.json --images --extract-images
+
+# Specify a custom directory for extracted images
+mistral-ocr convert results.json --images --extract-images --image-dir images_folder
 ```
 
 #### Process and Convert in One Step
@@ -134,6 +140,12 @@ mistral-ocr markdown path/to/document.pdf --output-file docs/result.md
 
 # Save intermediate JSON and generate markdown files
 mistral-ocr markdown path/to/document.pdf --json-file results.json --output-dir docs
+
+# Extract images to files instead of embedding them in markdown
+mistral-ocr markdown path/to/document.pdf --images --extract-images
+
+# Specify a custom directory for extracted images
+mistral-ocr markdown path/to/document.pdf --images --extract-images --image-dir custom_images
 ```
 
 This command combines the `process` and `convert` steps, creating markdown files directly from the document.
@@ -182,8 +194,26 @@ mistral-ocr markdown ~/Documents/research-paper.pdf --single-file --output-dir r
 
 # Generate a single markdown file with specific filename
 mistral-ocr markdown ~/Documents/research-paper.pdf --output-file research_docs/paper.md
+
+# Process a document and extract images to separate files
+mistral-ocr markdown ~/Documents/research-paper.pdf --images --extract-images --output-dir research_docs
 ```
 
+## Image Handling
+
+The tool provides several options for handling images in the OCR output:
+
+1. **No images**: By default, images are not included in the output.
+
+2. **Embedded images**: Using the `--images` flag without `--extract-images` will embed base64-encoded images directly in the markdown file. This creates a self-contained document but can result in very large files.
+
+3. **Extracted images**: Using both `--images` and `--extract-images` flags will:
+   - Extract images from the OCR results
+   - Save them as separate files in an images directory
+   - Reference these files in the markdown instead of embedding the base64 data
+
+You can specify a custom directory for extracted images using the `--image-dir` option. If not specified, images will be saved in a subdirectory called "images" within the output directory.
+
 ## OCR Response Format
 
 The OCR API returns a JSON response with the following structure:
diff --git a/mistral_ocr/__main__.py b/mistral_ocr/__main__.py
index fac57a1..e3c5b93 100644
--- a/mistral_ocr/__main__.py
+++ b/mistral_ocr/__main__.py
@@ -32,6 +32,8 @@ def main():
     convert_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
     convert_parser.add_argument("-o", "--output-file", help="Output filename for single file mode (default: document.md)")
     convert_parser.add_argument("--images", action="store_true", help="Include images in markdown (if available)")
+    convert_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them")
+    convert_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)")
     convert_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
     convert_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
     convert_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
@@ -43,6 +45,8 @@ def main():
     markdown_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
     markdown_parser.add_argument("-o", "--output-file", help="Path for output markdown file (implies --single-file)")
     markdown_parser.add_argument("--images", action="store_true", help="Include extracted images in markdown (if available)")
+    markdown_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them")
+    markdown_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)")
     markdown_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
     markdown_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
     markdown_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
diff --git a/mistral_ocr/commands/convert.py b/mistral_ocr/commands/convert.py
index 6c4c13f..98013f3 100644
--- a/mistral_ocr/commands/convert.py
+++ b/mistral_ocr/commands/convert.py
@@ -2,6 +2,8 @@ import json
 import os
 import sys
 import re
+import base64
+import uuid
 from pathlib import Path
 
 def run(args):
@@ -78,36 +80,94 @@ class OCRResponse:
         self.pages = pages or []
         self.metadata = metadata or OCRResponseMetadata()
 
-def replace_image_references(content, images, include_images):
+def extract_image_to_file(image_base64, image_id, image_dir):
     """
-    Replace image references in markdown content with base64 data.
+    Extract a base64-encoded image to a file.
+    
+    Args:
+        image_base64 (str): Base64-encoded image data
+        image_id (str): Unique identifier for the image
+        image_dir (str): Directory to save the image
+        
+    Returns:
+        str: Path of the saved image in the form "<basename of image_dir>/<filename>", or None if the image could not be saved
+    """
+    # Create the image directory if it doesn't exist
+    os.makedirs(image_dir, exist_ok=True)
+    
+    # Clean up the base64 data if it has a data URL prefix
+    if image_base64.startswith("data:"):
+        # Extract the image type and base64 data
+        match = re.match(r"data:image/(\w+);base64,(.+)", image_base64)
+        if match:
+            image_type, image_data = match.groups()
+        else:
+            # Default to JPEG if format can't be determined
+            image_type = "jpeg"
+            image_data = image_base64.split(",", 1)[1]
+    else:
+        # Default to JPEG if no data URL prefix
+        image_type = "jpeg"
+        image_data = image_base64
+    
+    # Generate a filename based on the image ID
+    # Use a sanitized version of the image ID or a UUID if needed
+    safe_id = re.sub(r'[^\w\-_]', '_', image_id) or str(uuid.uuid4())
+    filename = f"{safe_id}.{image_type}"
+    file_path = os.path.join(image_dir, filename)
+    
+    # Decode and save the image
+    try:
+        with open(file_path, 'wb') as f:
+            f.write(base64.b64decode(image_data))
+        return os.path.join(os.path.basename(image_dir), filename)
+    except Exception as e:
+        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
+        return None
+
+def replace_image_references(content, images, include_images, extract_images=False, image_dir=None):
+    """
+    Replace image references in markdown content with either base64 data or file references.
     
     Args:
         content (str): Markdown content with image references
         images (list): List of OCRResponseImage objects
         include_images (bool): Whether to include images in the output
+        extract_images (bool, optional): Whether to extract images to files. Defaults to False.
+        image_dir (str, optional): Directory to save extracted images. Defaults to None.
         
     Returns:
-        str: Markdown content with image references replaced with base64 data
+        str: Markdown content with image references replaced
     """
     if not include_images or not images:
         return content
     
-    # Create a map of image IDs to their base64 data
+    # Create a map of image IDs to their image data (either base64 or file path)
     image_map = {}
     for img in images:
         if img.image_base64:
-            img_data = img.image_base64
-            if not img_data.startswith("data:"):
-                img_data = "data:image/jpeg;base64," + img_data
-            image_map[img.id] = img_data
+            if extract_images and image_dir:
+                # Extract the image to a file and use the file path
+                img_data = img.image_base64
+                if not img_data.startswith("data:"):
+                    img_data = "data:image/jpeg;base64," + img_data
+                
+                file_path = extract_image_to_file(img_data, img.id, image_dir)
+                if file_path:
+                    image_map[img.id] = file_path
+            else:
+                # Use base64 data directly
+                img_data = img.image_base64
+                if not img_data.startswith("data:"):
+                    img_data = "data:image/jpeg;base64," + img_data
+                image_map[img.id] = img_data
     
-    # Replace all image references with base64 data
-    for id, base64_data in image_map.items():
+    # Replace all image references
+    for id, image_ref in image_map.items():
         # Escape special characters in the ID for regex
         escaped_id = re.escape(id)
         pattern = f"!\\[{escaped_id}\\]\\({escaped_id}\\)"
-        replacement = f"![{id}]({base64_data})"
+        replacement = f"![{id}]({image_ref})"
         
         content = re.sub(pattern, replacement, content)
     
@@ -124,6 +184,14 @@ def convert_json_to_markdown(json_file, args):
     Raises:
         SystemExit: If an error occurs during conversion
     """
+    # Determine image directory if extracting images
+    image_dir = None
+    if args.extract_images and args.images:
+        if args.image_dir:
+            image_dir = args.image_dir
+        else:
+            # Default to output_dir/images
+            image_dir = os.path.join(args.output_dir, "images")
     try:
         # Read JSON file
         with open(json_file, 'r', encoding='utf-8') as f:
@@ -199,10 +267,16 @@ def convert_json_to_markdown(json_file, args):
                 # Convert page images to OCRResponseImage format
                 page_images = [OCRResponseImage(img.id, img.image_base64) for img in page.images]
                 
-                # Replace image references in markdown content if includeImages is true
+                # Replace image references in markdown content when the --images flag is set
                 page_content = page.markdown
                 if args.images:
-                    page_content = replace_image_references(page_content, page_images, args.images)
+                    page_content = replace_image_references(
+                        page_content, 
+                        page_images, 
+                        args.images, 
+                        args.extract_images, 
+                        image_dir
+                    )
                 
                 # Add page content
                 combined.append(page_content)
@@ -240,7 +314,13 @@ def convert_json_to_markdown(json_file, args):
                 # Get page content with image references replaced if needed
                 markdown_content = page.markdown
                 if args.images:
-                    markdown_content = replace_image_references(markdown_content, page_images, args.images)
+                    markdown_content = replace_image_references(
+                        markdown_content, 
+                        page_images, 
+                        args.images, 
+                        args.extract_images, 
+                        image_dir
+                    )
                 
                 with open(output_path, 'w', encoding='utf-8') as f:
                     f.write(markdown_content)
diff --git a/mistral_ocr/commands/markdown.py b/mistral_ocr/commands/markdown.py
index c67bc71..09bdf9b 100644
--- a/mistral_ocr/commands/markdown.py
+++ b/mistral_ocr/commands/markdown.py
@@ -18,8 +18,8 @@ def run(args):
     Raises:
         SystemExit: If an error occurs during processing or conversion
     """
-    # Ensure that if --images is true, include_image_base64 is also true
-    include_image_base64 = args.images
+    # Ensure that if --images or --extract-images is true, include_image_base64 is also true
+    include_image_base64 = args.images or args.extract_images
     
     # If output file is specified, enable single file mode
     if args.output_file: