Add feature to extract images as separate files

This commit is contained in:
2025-04-24 21:44:49 +02:00
parent 012755b7f4
commit 220864d52f
6 changed files with 197 additions and 17 deletions
+4
View File
@@ -32,6 +32,8 @@ def main():
convert_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
convert_parser.add_argument("-o", "--output-file", help="Output filename for single file mode (default: document.md)")
convert_parser.add_argument("--images", action="store_true", help="Include images in markdown (if available)")
convert_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them")
convert_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)")
convert_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
convert_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
convert_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
@@ -43,6 +45,8 @@ def main():
markdown_parser.add_argument("-d", "--output-dir", default="markdown_output", help="Directory to store markdown files")
markdown_parser.add_argument("-o", "--output-file", help="Path for output markdown file (implies --single-file)")
markdown_parser.add_argument("--images", action="store_true", help="Include extracted images in markdown (if available)")
markdown_parser.add_argument("--extract-images", action="store_true", help="Extract images to files instead of embedding them")
markdown_parser.add_argument("--image-dir", help="Directory to store extracted images (default: output_dir/images)")
markdown_parser.add_argument("--page-breaks", action="store_true", default=True, help="Include page break indicators between pages")
markdown_parser.add_argument("--title-from-filename", action="store_true", default=True, help="Use filename as document title")
markdown_parser.add_argument("--single-file", action="store_true", help="Create a single markdown file instead of one per page")
+94 -14
View File
@@ -2,6 +2,8 @@ import json
import os
import sys
import re
import base64
import uuid
from pathlib import Path
def run(args):
@@ -78,36 +80,94 @@ class OCRResponse:
self.pages = pages or []
self.metadata = metadata or OCRResponseMetadata()
def replace_image_references(content, images, include_images):
def extract_image_to_file(image_base64, image_id, image_dir):
    """
    Extract a base64-encoded image to a file.

    Args:
        image_base64 (str): Base64-encoded image data, either raw or wrapped
            in a ``data:image/<type>;base64,`` URL prefix
        image_id (str): Unique identifier for the image
        image_dir (str): Directory to save the image

    Returns:
        str: Path to the saved image file, expressed as
            ``<last component of image_dir>/<filename>`` with a forward
            slash so it is usable as a markdown link target, or None if the
            image could not be decoded or written
    """
    # Default to JPEG when the format can't be determined
    image_type = "jpeg"
    image_data = image_base64

    # Clean up the base64 data if it has a data URL prefix
    if image_base64.startswith("data:"):
        # Subtypes may contain '+', '.' or '-' (e.g. "svg+xml"); DOTALL lets
        # line-wrapped base64 payloads match as well.
        match = re.match(
            r"data:image/([\w.+-]+);base64,(.+)", image_base64, re.DOTALL
        )
        if match:
            subtype, image_data = match.groups()
            # "svg+xml" -> "svg": drop the structured-syntax suffix
            image_type = subtype.split("+", 1)[0] or "jpeg"
        else:
            # partition never raises, unlike split(",", 1)[1] on a prefix
            # that contains no comma
            image_data = image_base64.partition(",")[2]

    if not image_data:
        print(
            f"Warning: Failed to save image {image_id}: no image data",
            file=sys.stderr,
        )
        return None

    # Generate a filename based on the image ID
    # Use a sanitized version of the image ID or a UUID if the ID is empty
    raw_id = "" if image_id is None else str(image_id)
    safe_id = re.sub(r"[^\w\-_]", "_", raw_id) or str(uuid.uuid4())
    filename = f"{safe_id}.{image_type}"
    file_path = os.path.join(image_dir, filename)

    try:
        # Decode before touching the filesystem so that invalid base64 does
        # not leave an empty file behind.
        decoded = base64.b64decode(image_data)
        # Create the image directory if it doesn't exist
        os.makedirs(image_dir, exist_ok=True)
        with open(file_path, "wb") as f:
            f.write(decoded)
    except (OSError, ValueError) as e:
        # binascii.Error (bad base64) is a ValueError subclass
        print(f"Warning: Failed to save image {image_id}: {e}", file=sys.stderr)
        return None

    # normpath strips a trailing separator, which would otherwise make
    # basename() return "" and drop the directory from the link.
    dir_name = os.path.basename(os.path.normpath(image_dir))
    # Markdown link targets need forward slashes on every platform.
    return f"{dir_name}/{filename}" if dir_name else filename
def replace_image_references(content, images, include_images, extract_images=False, image_dir=None):
"""
Replace image references in markdown content with either base64 data or file references.
Args:
content (str): Markdown content with image references
images (list): List of OCRResponseImage objects
include_images (bool): Whether to include images in the output
extract_images (bool, optional): Whether to extract images to files. Defaults to False.
image_dir (str, optional): Directory to save extracted images. Defaults to None.
Returns:
str: Markdown content with image references replaced with base64 data
str: Markdown content with image references replaced
"""
if not include_images or not images:
return content
# Create a map of image IDs to their base64 data
# Create a map of image IDs to their image data (either base64 or file path)
image_map = {}
for img in images:
if img.image_base64:
img_data = img.image_base64
if not img_data.startswith("data:"):
img_data = "data:image/jpeg;base64," + img_data
image_map[img.id] = img_data
if extract_images and image_dir:
# Extract the image to a file and use the file path
img_data = img.image_base64
if not img_data.startswith("data:"):
img_data = "data:image/jpeg;base64," + img_data
file_path = extract_image_to_file(img_data, img.id, image_dir)
if file_path:
image_map[img.id] = file_path
else:
# Use base64 data directly
img_data = img.image_base64
if not img_data.startswith("data:"):
img_data = "data:image/jpeg;base64," + img_data
image_map[img.id] = img_data
# Replace all image references with base64 data
for id, base64_data in image_map.items():
# Replace all image references
for id, image_ref in image_map.items():
# Escape special characters in the ID for regex
escaped_id = re.escape(id)
pattern = f"!\\[{escaped_id}\\]\\({escaped_id}\\)"
replacement = f"![{id}]({base64_data})"
replacement = f"![{id}]({image_ref})"
content = re.sub(pattern, replacement, content)
@@ -124,6 +184,14 @@ def convert_json_to_markdown(json_file, args):
Raises:
SystemExit: If an error occurs during conversion
"""
# Determine image directory if extracting images
image_dir = None
if args.extract_images and args.images:
if args.image_dir:
image_dir = args.image_dir
else:
# Default to output_dir/images
image_dir = os.path.join(args.output_dir, "images")
try:
# Read JSON file
with open(json_file, 'r', encoding='utf-8') as f:
@@ -199,10 +267,16 @@ def convert_json_to_markdown(json_file, args):
# Convert page images to OCRResponseImage format
page_images = [OCRResponseImage(img.id, img.image_base64) for img in page.images]
# Replace image references in markdown content if includeImages is true
# Replace image references in markdown content if images is true
page_content = page.markdown
if args.images:
page_content = replace_image_references(page_content, page_images, args.images)
page_content = replace_image_references(
page_content,
page_images,
args.images,
args.extract_images,
image_dir
)
# Add page content
combined.append(page_content)
@@ -240,7 +314,13 @@ def convert_json_to_markdown(json_file, args):
# Get page content with image references replaced if needed
markdown_content = page.markdown
if args.images:
markdown_content = replace_image_references(markdown_content, page_images, args.images)
markdown_content = replace_image_references(
markdown_content,
page_images,
args.images,
args.extract_images,
image_dir
)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
+2 -2
View File
@@ -18,8 +18,8 @@ def run(args):
Raises:
SystemExit: If an error occurs during processing or conversion
"""
# Ensure that if --images is true, include_image_base64 is also true
include_image_base64 = args.images
# Ensure that if --images or --extract-images is true, include_image_base64 is also true
include_image_base64 = args.images or args.extract_images
# If output file is specified, enable single file mode
if args.output_file: