Add feature to extract images as separate files
This commit is contained in:
@@ -13,6 +13,8 @@ This document provides detailed information about the Mistral OCR API response f
|
|||||||
- [Working with the API Response](#working-with-the-api-response)
|
- [Working with the API Response](#working-with-the-api-response)
|
||||||
- [Parsing the JSON Response](#parsing-the-json-response)
|
- [Parsing the JSON Response](#parsing-the-json-response)
|
||||||
- [Handling Images](#handling-images)
|
- [Handling Images](#handling-images)
|
||||||
|
- [1. Embedded Images](#1-embedded-images)
|
||||||
|
- [2. Extracted Images](#2-extracted-images)
|
||||||
- [Working with Markdown Content](#working-with-markdown-content)
|
- [Working with Markdown Content](#working-with-markdown-content)
|
||||||
- [Error Handling](#error-handling)
|
- [Error Handling](#error-handling)
|
||||||
- [API Key Errors](#api-key-errors)
|
- [API Key Errors](#api-key-errors)
|
||||||
@@ -113,7 +115,11 @@ for page in ocr_data.get('pages', []):
|
|||||||
|
|
||||||
### Handling Images
|
### Handling Images
|
||||||
|
|
||||||
If you've included images in the response (using the `--include-images` flag), you can extract and save them:
|
The Mistral OCR CLI provides two approaches for handling images:
|
||||||
|
|
||||||
|
#### 1. Embedded Images
|
||||||
|
|
||||||
|
When using the `--images` flag without `--extract-images`, images are embedded directly in the markdown as base64 data. If you've included images in the response (using the `--include-images` flag), you can extract and save them manually:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import base64
|
import base64
|
||||||
@@ -141,6 +147,65 @@ for page in ocr_data.get('pages', []):
|
|||||||
img_file.write(img_bytes)
|
img_file.write(img_bytes)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### 2. Extracted Images
|
||||||
|
|
||||||
|
Alternatively, you can use the `--extract-images` flag with the CLI to automatically extract images to separate files. This approach:
|
||||||
|
|
||||||
|
- Saves each image as a separate file in the specified directory (or `output_dir/images` by default)
|
||||||
|
- Updates the markdown to reference these image files instead of embedding base64 data
|
||||||
|
- Results in smaller, more manageable markdown files
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
```bash
|
||||||
|
mistral-ocr markdown document.pdf --images --extract-images --image-dir custom_images
|
||||||
|
```
|
||||||
|
|
||||||
|
If you're working with the API directly and want to implement similar functionality, here's how you might do it:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
def extract_images_from_ocr_data(ocr_data, image_dir='images'):
    """Save every embedded image to ``image_dir`` and relink the page markdown.

    Args:
        ocr_data (dict): Parsed OCR response. Each entry of ``pages`` may carry
            ``markdown`` text and an ``images`` list whose items hold an ``id``
            and an ``image_base64`` payload (raw base64 or a data URL).
        image_dir (str): Directory that receives the image files; it is
            created if missing.

    Returns:
        dict: The same ``ocr_data`` object, modified in place so that every
        ``![id](id)`` placeholder points at the file written for that image.

    Raises:
        binascii.Error: If an image payload is not valid base64.
        OSError: If the directory or an image file cannot be written.
    """
    known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tif', '.tiff')
    os.makedirs(image_dir, exist_ok=True)

    for page in ocr_data.get('pages', []):
        markdown = page.get('markdown', '')

        for img_index, image in enumerate(page.get('images', [])):
            img_id = image.get('id') or f'unknown-{img_index}'
            img_data = image.get('image_base64', '')
            if not img_data:
                continue

            # Build a filename that cannot leave image_dir (no separators or
            # spaces) and only gets ".jpg" when the id has no image suffix yet.
            filename = re.sub(r'[^\w.\-]', '_', img_id)
            if not filename.lower().endswith(known_suffixes):
                filename += '.jpg'
            filepath = os.path.join(image_dir, filename)

            # Remove data URL prefix if present
            if ',' in img_data:
                img_data = img_data.split(',', 1)[1]

            # Decode before opening so a bad payload never leaves an empty file.
            img_bytes = base64.b64decode(img_data)
            with open(filepath, 'wb') as img_file:
                img_file.write(img_bytes)

            # Markdown links use forward slashes. A literal replace is used
            # because the link is a filesystem path: re.sub would interpret
            # any backslash in it as a regex escape.
            link = filepath.replace(os.sep, '/')
            markdown = markdown.replace(
                f"![{img_id}]({img_id})",
                f"![{img_id}]({link})",
            )

        # Update the page's markdown
        page['markdown'] = markdown

    return ocr_data
|
||||||
|
```
|
||||||
|
|
||||||
### Working with Markdown Content
|
### Working with Markdown Content
|
||||||
|
|
||||||
The OCR results are provided in Markdown format, which makes it easy to convert to other formats or display in applications:
|
The OCR results are provided in Markdown format, which makes it easy to convert to other formats or display in applications:
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- Enhanced documentation including README.md, CONTRIBUTING.md, and CHANGELOG.md
|
- Enhanced documentation including README.md, CONTRIBUTING.md, and CHANGELOG.md
|
||||||
- More detailed troubleshooting section
|
- More detailed troubleshooting section
|
||||||
- API response format documentation
|
- API response format documentation
|
||||||
|
- Option to extract images to separate files instead of embedding them in markdown
|
||||||
|
|
||||||
## [0.1.0] - 2025-04-24
|
## [0.1.0] - 2025-04-24
|
||||||
|
|
||||||
|
|||||||
@@ -113,6 +113,12 @@ mistral-ocr convert results.json --output-file document.md
|
|||||||
|
|
||||||
# Include images in markdown (if available in JSON)
|
# Include images in markdown (if available in JSON)
|
||||||
mistral-ocr convert results.json --images
|
mistral-ocr convert results.json --images
|
||||||
|
|
||||||
|
# Extract images to files instead of embedding them in markdown
|
||||||
|
mistral-ocr convert results.json --images --extract-images
|
||||||
|
|
||||||
|
# Specify a custom directory for extracted images
|
||||||
|
mistral-ocr convert results.json --images --extract-images --image-dir images_folder
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Process and Convert in One Step
|
#### Process and Convert in One Step
|
||||||
@@ -134,6 +140,12 @@ mistral-ocr markdown path/to/document.pdf --output-file docs/result.md
|
|||||||
|
|
||||||
# Save intermediate JSON and generate markdown files
|
# Save intermediate JSON and generate markdown files
|
||||||
mistral-ocr markdown path/to/document.pdf --json-file results.json --output-dir docs
|
mistral-ocr markdown path/to/document.pdf --json-file results.json --output-dir docs
|
||||||
|
|
||||||
|
# Extract images to files instead of embedding them in markdown
|
||||||
|
mistral-ocr markdown path/to/document.pdf --images --extract-images
|
||||||
|
|
||||||
|
# Specify a custom directory for extracted images
|
||||||
|
mistral-ocr markdown path/to/document.pdf --images --extract-images --image-dir custom_images
|
||||||
```
|
```
|
||||||
|
|
||||||
This command combines the `process` and `convert` steps, creating markdown files directly from the document.
|
This command combines the `process` and `convert` steps, creating markdown files directly from the document.
|
||||||
@@ -182,8 +194,26 @@ mistral-ocr markdown ~/Documents/research-paper.pdf --single-file --output-dir r
|
|||||||
|
|
||||||
# Generate a single markdown file with specific filename
|
# Generate a single markdown file with specific filename
|
||||||
mistral-ocr markdown ~/Documents/research-paper.pdf --output-file research_docs/paper.md
|
mistral-ocr markdown ~/Documents/research-paper.pdf --output-file research_docs/paper.md
|
||||||
|
|
||||||
|
# Process a document and extract images to separate files
|
||||||
|
mistral-ocr markdown ~/Documents/research-paper.pdf --images --extract-images --output-dir research_docs
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Image Handling
|
||||||
|
|
||||||
|
The tool provides several options for handling images in the OCR output:
|
||||||
|
|
||||||
|
1. **No images**: By default, images are not included in the output.
|
||||||
|
|
||||||
|
2. **Embedded images**: Using the `--images` flag without `--extract-images` will embed base64-encoded images directly in the markdown file. This creates a self-contained document but can result in very large files.
|
||||||
|
|
||||||
|
3. **Extracted images**: Using both `--images` and `--extract-images` flags will:
|
||||||
|
- Extract images from the OCR results
|
||||||
|
- Save them as separate files in an images directory
|
||||||
|
- Reference these files in the markdown instead of embedding the base64 data
|
||||||
|
|
||||||
|
You can specify a custom directory for extracted images using the `--image-dir` option. If not specified, images will be saved in a subdirectory called "images" within the output directory.
|
||||||
|
|
||||||
## OCR Response Format
|
## OCR Response Format
|
||||||
|
|
||||||
The OCR API returns a JSON response with the following structure:
|
The OCR API returns a JSON response with the following structure:
|
||||||
|
|||||||
@@ -32,6 +32,8 @@ def main():
|
|||||||
convert_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
|
convert_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
|
||||||
convert_parser.add_argument("-o", "--output-file", help="Output filename for single file mode (default: document.md)")
|
convert_parser.add_argument("-o", "--output-file", help="Output filename for single file mode (default: document.md)")
|
||||||
convert_parser.add_argument("--images", action="store_true", help="Include images in markdown (if available)")
|
convert_parser.add_argument("--images", action="store_true", help="Include images in markdown (if available)")
|
||||||
|
convert_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them")
|
||||||
|
convert_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)")
|
||||||
convert_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
|
convert_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
|
||||||
convert_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
|
convert_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
|
||||||
convert_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
|
convert_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
|
||||||
@@ -43,6 +45,8 @@ def main():
|
|||||||
markdown_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
|
markdown_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
|
||||||
markdown_parser.add_argument("-o", "--output-file", help="Path for output markdown file (implies --single-file)")
|
markdown_parser.add_argument("-o", "--output-file", help="Path for output markdown file (implies --single-file)")
|
||||||
markdown_parser.add_argument("--images", action="store_true", help="Include extracted images in markdown (if available)")
|
markdown_parser.add_argument("--images", action="store_true", help="Include extracted images in markdown (if available)")
|
||||||
|
markdown_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them")
|
||||||
|
markdown_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)")
|
||||||
markdown_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
|
markdown_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
|
||||||
markdown_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
|
markdown_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
|
||||||
markdown_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
|
markdown_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ import json
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
|
import base64
|
||||||
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
def run(args):
|
def run(args):
|
||||||
@@ -78,36 +80,94 @@ class OCRResponse:
|
|||||||
self.pages = pages or []
|
self.pages = pages or []
|
||||||
self.metadata = metadata or OCRResponseMetadata()
|
self.metadata = metadata or OCRResponseMetadata()
|
||||||
|
|
||||||
def replace_image_references(content, images, include_images):
|
def extract_image_to_file(image_base64, image_id, image_dir):
    """
    Extract a base64-encoded image to a file.

    Args:
        image_base64 (str): Base64-encoded image data, either raw or wrapped
            in a ``data:image/<type>;base64,...`` URL
        image_id (str): Unique identifier for the image
        image_dir (str): Directory to save the image (created if missing)

    Returns:
        str: ``<last component of image_dir>/<filename>`` with a forward
            slash, suitable for a markdown link written next to that
            directory, or None if the image could not be decoded or saved
    """
    try:
        if image_base64.startswith("data:"):
            # Split on the first comma instead of matching the whole payload
            # with a regex: the payload may span several lines and the MIME
            # subtype may carry a suffix such as "svg+xml".
            header, separator, image_data = image_base64.partition(",")
            if not separator:
                raise ValueError("malformed data URL (no ',' separator)")
            type_match = re.match(r"data:image/([\w-]+)", header)
            # Default to JPEG if format can't be determined
            image_type = type_match.group(1) if type_match else "jpeg"
        else:
            # Default to JPEG if no data URL prefix
            image_type = "jpeg"
            image_data = image_base64

        # Use a sanitized version of the image ID or a UUID if needed
        safe_id = re.sub(r'[^\w\-_]', '_', image_id or "") or str(uuid.uuid4())
        filename = f"{safe_id}.{image_type}"

        # Decode first so invalid base64 never leaves an empty file behind.
        image_bytes = base64.b64decode(image_data)

        os.makedirs(image_dir, exist_ok=True)
        with open(os.path.join(image_dir, filename), 'wb') as f:
            f.write(image_bytes)
    except (OSError, ValueError) as e:
        # binascii.Error (bad base64) is a ValueError subclass.
        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
        return None

    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" and lose the directory part of the link.
    dir_name = os.path.basename(os.path.normpath(image_dir))
    # Forward slash: the result is a markdown link target, and a backslash
    # would later be read as an escape in a re.sub() replacement string.
    return f"{dir_name}/{filename}" if dir_name else filename
|
||||||
|
|
||||||
|
def replace_image_references(content, images, include_images, extract_images=False, image_dir=None):
|
||||||
|
"""
|
||||||
|
Replace image references in markdown content with either base64 data or file references.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
content (str): Markdown content with image references
|
content (str): Markdown content with image references
|
||||||
images (list): List of OCRResponseImage objects
|
images (list): List of OCRResponseImage objects
|
||||||
include_images (bool): Whether to include images in the output
|
include_images (bool): Whether to include images in the output
|
||||||
|
extract_images (bool, optional): Whether to extract images to files. Defaults to False.
|
||||||
|
image_dir (str, optional): Directory to save extracted images. Defaults to None.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Markdown content with image references replaced with base64 data
|
str: Markdown content with image references replaced
|
||||||
"""
|
"""
|
||||||
if not include_images or not images:
|
if not include_images or not images:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
# Create a map of image IDs to their base64 data
|
# Create a map of image IDs to their image data (either base64 or file path)
|
||||||
image_map = {}
|
image_map = {}
|
||||||
for img in images:
|
for img in images:
|
||||||
if img.image_base64:
|
if img.image_base64:
|
||||||
|
if extract_images and image_dir:
|
||||||
|
# Extract the image to a file and use the file path
|
||||||
|
img_data = img.image_base64
|
||||||
|
if not img_data.startswith("data:"):
|
||||||
|
img_data = "data:image/jpeg;base64," + img_data
|
||||||
|
|
||||||
|
file_path = extract_image_to_file(img_data, img.id, image_dir)
|
||||||
|
if file_path:
|
||||||
|
image_map[img.id] = file_path
|
||||||
|
else:
|
||||||
|
# Use base64 data directly
|
||||||
img_data = img.image_base64
|
img_data = img.image_base64
|
||||||
if not img_data.startswith("data:"):
|
if not img_data.startswith("data:"):
|
||||||
img_data = "data:image/jpeg;base64," + img_data
|
img_data = "data:image/jpeg;base64," + img_data
|
||||||
image_map[img.id] = img_data
|
image_map[img.id] = img_data
|
||||||
|
|
||||||
# Replace all image references with base64 data
|
# Replace all image references
|
||||||
for id, base64_data in image_map.items():
|
for id, image_ref in image_map.items():
|
||||||
# Escape special characters in the ID for regex
|
# Escape special characters in the ID for regex
|
||||||
escaped_id = re.escape(id)
|
escaped_id = re.escape(id)
|
||||||
pattern = f"!\\[{escaped_id}\\]\\({escaped_id}\\)"
|
pattern = f"!\\[{escaped_id}\\]\\({escaped_id}\\)"
|
||||||
replacement = f"![{id}]({base64_data})"
|
replacement = f"![{id}]({image_ref})"
|
||||||
|
|
||||||
content = re.sub(pattern, replacement, content)
|
content = re.sub(pattern, replacement, content)
|
||||||
|
|
||||||
@@ -124,6 +184,14 @@ def convert_json_to_markdown(json_file, args):
|
|||||||
Raises:
|
Raises:
|
||||||
SystemExit: If an error occurs during conversion
|
SystemExit: If an error occurs during conversion
|
||||||
"""
|
"""
|
||||||
|
# Determine image directory if extracting images
|
||||||
|
image_dir = None
|
||||||
|
if args.extract_images and args.images:
|
||||||
|
if args.image_dir:
|
||||||
|
image_dir = args.image_dir
|
||||||
|
else:
|
||||||
|
# Default to output_dir/images
|
||||||
|
image_dir = os.path.join(args.output_dir, "images")
|
||||||
try:
|
try:
|
||||||
# Read JSON file
|
# Read JSON file
|
||||||
with open(json_file, 'r', encoding='utf-8') as f:
|
with open(json_file, 'r', encoding='utf-8') as f:
|
||||||
@@ -199,10 +267,16 @@ def convert_json_to_markdown(json_file, args):
|
|||||||
# Convert page images to OCRResponseImage format
|
# Convert page images to OCRResponseImage format
|
||||||
page_images = [OCRResponseImage(img.id, img.image_base64) for img in page.images]
|
page_images = [OCRResponseImage(img.id, img.image_base64) for img in page.images]
|
||||||
|
|
||||||
# Replace image references in markdown content if includeImages is true
|
# Replace image references in markdown content if images is true
|
||||||
page_content = page.markdown
|
page_content = page.markdown
|
||||||
if args.images:
|
if args.images:
|
||||||
page_content = replace_image_references(page_content, page_images, args.images)
|
page_content = replace_image_references(
|
||||||
|
page_content,
|
||||||
|
page_images,
|
||||||
|
args.images,
|
||||||
|
args.extract_images,
|
||||||
|
image_dir
|
||||||
|
)
|
||||||
|
|
||||||
# Add page content
|
# Add page content
|
||||||
combined.append(page_content)
|
combined.append(page_content)
|
||||||
@@ -240,7 +314,13 @@ def convert_json_to_markdown(json_file, args):
|
|||||||
# Get page content with image references replaced if needed
|
# Get page content with image references replaced if needed
|
||||||
markdown_content = page.markdown
|
markdown_content = page.markdown
|
||||||
if args.images:
|
if args.images:
|
||||||
markdown_content = replace_image_references(markdown_content, page_images, args.images)
|
markdown_content = replace_image_references(
|
||||||
|
markdown_content,
|
||||||
|
page_images,
|
||||||
|
args.images,
|
||||||
|
args.extract_images,
|
||||||
|
image_dir
|
||||||
|
)
|
||||||
|
|
||||||
with open(output_path, 'w', encoding='utf-8') as f:
|
with open(output_path, 'w', encoding='utf-8') as f:
|
||||||
f.write(markdown_content)
|
f.write(markdown_content)
|
||||||
|
|||||||
@@ -18,8 +18,8 @@ def run(args):
|
|||||||
Raises:
|
Raises:
|
||||||
SystemExit: If an error occurs during processing or conversion
|
SystemExit: If an error occurs during processing or conversion
|
||||||
"""
|
"""
|
||||||
# Ensure that if --images is true, include_image_base64 is also true
|
# Ensure that if --images or --extract-images is true, include_image_base64 is also true
|
||||||
include_image_base64 = args.images
|
include_image_base64 = args.images or args.extract_images
|
||||||
|
|
||||||
# If output file is specified, enable single file mode
|
# If output file is specified, enable single file mode
|
||||||
if args.output_file:
|
if args.output_file:
|
||||||
|
|||||||
Reference in New Issue
Block a user