Initial commit
This commit is contained in:
+40
@@ -0,0 +1,40 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Virtual Environment
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# OS specific
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Mistral OCR specific
|
||||
markdown_output/
|
||||
*.json
|
||||
@@ -0,0 +1,168 @@
|
||||
# Mistral OCR CLI (Python)
|
||||
|
||||
A command-line tool for processing documents with Mistral AI's OCR capabilities, implemented in Python.
|
||||
|
||||
## Features
|
||||
|
||||
- Process PDF documents and images using Mistral AI's OCR
|
||||
- Extract text and structured content from documents
|
||||
- Process local files or files from URLs
|
||||
- Output results to stdout or to a file
|
||||
- Convert OCR results to Markdown format
|
||||
- Maintain document structure and formatting in the output
|
||||
|
||||
## Installation
|
||||
|
||||
### Requirements
|
||||
|
||||
- Python 3.7 or later
|
||||
- pip (Python package installer)
|
||||
|
||||
### Installing from source
|
||||
|
||||
```bash
|
||||
git clone https://github.com/yourusername/mistral-ocr-python
|
||||
cd mistral-ocr-python
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Alternatively, you can use the build script:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/yourusername/mistral-ocr-python
|
||||
cd mistral-ocr-python
|
||||
./build.sh
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Setting up your API key
|
||||
|
||||
You can provide your Mistral API key in two ways:
|
||||
|
||||
1. Environment variable:
|
||||
```bash
|
||||
export MISTRAL_API_KEY=your-api-key
|
||||
```
|
||||
|
||||
2. Command line flag:
|
||||
```bash
|
||||
mistral-ocr --api-key=your-api-key [command]
|
||||
```
|
||||
|
||||
### Commands
|
||||
|
||||
#### Process a document
|
||||
|
||||
Process a document file or URL:
|
||||
|
||||
```bash
|
||||
# Process a local PDF file
|
||||
mistral-ocr process path/to/document.pdf
|
||||
|
||||
# Process a document from a URL
|
||||
mistral-ocr process https://example.com/document.pdf
|
||||
|
||||
# Process an image from a URL
|
||||
mistral-ocr process https://example.com/image.jpg
|
||||
|
||||
# Save output to a file
|
||||
mistral-ocr process path/to/document.pdf --output-file results.json
|
||||
|
||||
# Include base64 encoded images in the output
|
||||
mistral-ocr process path/to/document.pdf --include-images
|
||||
```
|
||||
|
||||
#### Convert OCR JSON to Markdown
|
||||
|
||||
Convert previously processed OCR JSON results to Markdown:
|
||||
|
||||
```bash
|
||||
# Convert OCR JSON to Markdown
|
||||
mistral-ocr convert results.json
|
||||
|
||||
# Specify output directory
|
||||
mistral-ocr convert results.json --output-dir output_folder
|
||||
|
||||
# Create a single markdown file instead of one per page
|
||||
mistral-ocr convert results.json --single-file
|
||||
|
||||
# Specify output filename for single file mode
|
||||
mistral-ocr convert results.json --output-file document.md
|
||||
|
||||
# Include images in markdown (if available in JSON)
|
||||
mistral-ocr convert results.json --images
|
||||
```
|
||||
|
||||
#### Process and Convert in One Step
|
||||
|
||||
Process a document and convert to Markdown in a single command:
|
||||
|
||||
```bash
|
||||
# Process document and generate markdown files
|
||||
mistral-ocr markdown path/to/document.pdf
|
||||
|
||||
# Generate a single markdown file instead of separate files per page
|
||||
mistral-ocr markdown path/to/document.pdf --single-file
|
||||
|
||||
# Specify output directory for markdown files
|
||||
mistral-ocr markdown https://example.com/document.pdf --output-dir docs
|
||||
|
||||
# Write everything to one named file (implies --single-file; the path is resolved relative to --output-dir)
|
||||
mistral-ocr markdown path/to/document.pdf --output-file docs/result.md
|
||||
|
||||
# Save intermediate JSON and generate markdown files
|
||||
mistral-ocr markdown path/to/document.pdf --json-file results.json --output-dir docs
|
||||
```
|
||||
|
||||
This command combines the `process` and `convert` steps, creating markdown files directly from the document.
|
||||
|
||||
#### Version information
|
||||
|
||||
```bash
|
||||
mistral-ocr version
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Process a local PDF and save the output
|
||||
|
||||
```bash
|
||||
mistral-ocr process ~/Documents/sample.pdf --output-file results.json
|
||||
```
|
||||
|
||||
### Process a document from a URL
|
||||
|
||||
```bash
|
||||
mistral-ocr process https://arxiv.org/pdf/2201.04234 > output.json
|
||||
```
|
||||
|
||||
### Convert OCR JSON to Markdown files
|
||||
|
||||
```bash
|
||||
# Create separate files (one per page)
|
||||
mistral-ocr convert output.json --output-dir markdown_docs
|
||||
|
||||
# Create a single file with all pages
|
||||
mistral-ocr convert output.json --single-file --output-dir markdown_docs
|
||||
|
||||
# Create a single file with a specific filename
|
||||
mistral-ocr convert output.json --output-file docs/paper.md
|
||||
```
|
||||
|
||||
### Process a document and generate markdown files in one step
|
||||
|
||||
```bash
|
||||
# Generate separate files (one per page)
|
||||
mistral-ocr markdown ~/Documents/research-paper.pdf --output-dir research_docs
|
||||
|
||||
# Generate a single markdown file
|
||||
mistral-ocr markdown ~/Documents/research-paper.pdf --single-file --output-dir research_docs
|
||||
|
||||
# Generate a single markdown file with specific filename
|
||||
mistral-ocr markdown ~/Documents/research-paper.pdf --output-file research_docs/paper.md
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
# Development build: prepare a virtualenv, install the package in editable
# mode, and run the unit tests when a tests/ directory is present.
# Any failing command aborts the script.
set -e

# Reuse an existing virtual environment; create one only when it is missing.
[ -d "venv" ] || python3 -m venv venv

# Everything below runs inside the virtual environment.
source venv/bin/activate

# Editable install pulls in the dependencies declared by the package.
pip install -e .

# The test suite is optional; skip quietly when there is no tests/ directory.
if [ -d "tests" ]; then
    python -m unittest discover tests
fi

echo "Build completed successfully!"
|
||||
@@ -0,0 +1 @@
|
||||
# Mistral OCR Python Package
|
||||
@@ -0,0 +1,67 @@
|
||||
import sys
|
||||
import argparse
|
||||
import os
|
||||
from mistral_ocr.commands import process, convert, markdown, version
|
||||
|
||||
def build_parser():
    """Build the argument parser for the ``mistral-ocr`` command line.

    Returns:
        argparse.ArgumentParser: parser exposing the ``process``, ``convert``,
        ``markdown`` and ``version`` sub-commands. The selected sub-command
        name is stored in ``args.command`` (``None`` when omitted).
    """
    parser = argparse.ArgumentParser(
        description="A CLI tool for performing OCR on documents using Mistral AI.",
        prog="mistral-ocr",
    )
    parser.add_argument("--api-key", help="Mistral API key (defaults to MISTRAL_API_KEY env variable)")

    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    def add_toggle(sub, flag, dest, help_on, help_off):
        # A store_true flag that defaults to True can never be turned off, so
        # every default-on option gets an explicit --no-<flag> counterpart
        # writing to the same destination.
        sub.add_argument(f"--{flag}", dest=dest, action="store_true", default=True, help=help_on)
        sub.add_argument(f"--no-{flag}", dest=dest, action="store_false", help=help_off)

    def add_layout_options(sub):
        # Options shared verbatim by the "convert" and "markdown" commands.
        add_toggle(sub, "page-breaks", "page_breaks",
                   "Include page break indicators between pages",
                   "Do not include page break indicators between pages")
        add_toggle(sub, "title-from-filename", "title_from_filename",
                   "Use filename as document title",
                   "Do not use the filename as document title")
        sub.add_argument("--single-file", action="store_true",
                         help="Create a single markdown file instead of one per page")

    # Process command
    process_parser = subparsers.add_parser("process", help="Process a document with OCR")
    process_parser.add_argument("file", help="File path or URL to process")
    process_parser.add_argument("-o", "--output-file", help="Output JSON file path (default is stdout)")
    process_parser.add_argument("--include-images", action="store_true",
                                help="Include base64 encoded images in the output")

    # Convert command
    convert_parser = subparsers.add_parser("convert", help="Convert OCR JSON output to Markdown")
    convert_parser.add_argument("json_file", help="JSON file to convert")
    convert_parser.add_argument("-d", "--output-dir", default="markdown_output",
                                help="Directory to store markdown files")
    convert_parser.add_argument("-o", "--output-file",
                                help="Output filename for single file mode (default: document.md)")
    convert_parser.add_argument("--images", action="store_true",
                                help="Include images in markdown (if available)")
    add_layout_options(convert_parser)

    # Markdown command
    markdown_parser = subparsers.add_parser("markdown", help="Process document and convert to markdown in one step")
    markdown_parser.add_argument("file_or_url", help="File path or URL to process")
    markdown_parser.add_argument("-j", "--json-file", help="Save intermediate JSON to file (optional)")
    markdown_parser.add_argument("-d", "--output-dir", default="markdown_output",
                                 help="Directory to store markdown files")
    markdown_parser.add_argument("-o", "--output-file",
                                 help="Path for output markdown file (implies --single-file)")
    markdown_parser.add_argument("--images", action="store_true",
                                 help="Include extracted images in markdown (if available)")
    add_layout_options(markdown_parser)

    # Version command
    subparsers.add_parser("version", help="Print the version number")

    return parser


def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Optional list of arguments (without the program name). When
            ``None`` the arguments are taken from ``sys.argv``.

    Returns:
        int: ``0`` after a sub-command ran, ``1`` when no sub-command was
        given (the help text is printed in that case).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # An explicit --api-key overrides the environment for the rest of the run.
    if args.api_key:
        os.environ["MISTRAL_API_KEY"] = args.api_key

    # argparse rejects unknown sub-commands itself, so the only remaining
    # non-dispatchable case is "no sub-command at all".
    if args.command is None:
        parser.print_help()
        return 1

    if args.command == "process":
        process.run(args)
    elif args.command == "convert":
        convert.run(args)
    elif args.command == "markdown":
        markdown.run(args)
    elif args.command == "version":
        version.run(args)

    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
@@ -0,0 +1,153 @@
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import requests
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
|
||||
class MistralClient:
    """Thin HTTP client for the Mistral file-upload and OCR endpoints.

    All requests go through one ``requests.Session`` that carries the bearer
    token. Retry counts, retry delays and the request timeout are class
    attributes so they can be tuned per subclass or per instance.
    """

    BASE_URL = "https://api.mistral.ai/v1"
    MAX_FILE_SIZE = 52 * 1024 * 1024  # 52 MB

    # (connect, read) timeout in seconds. Without a timeout a stalled
    # connection would block the CLI forever.
    REQUEST_TIMEOUT = (10, 300)

    UPLOAD_MAX_RETRIES = 3
    UPLOAD_RETRY_DELAY = 3  # seconds between upload attempts
    OCR_MAX_RETRIES = 5
    OCR_RETRY_DELAY = 10  # base delay in seconds between OCR attempts

    def __init__(self, api_key: Optional[str] = None):
        """Create a client.

        Args:
            api_key: Mistral API key; falls back to the ``MISTRAL_API_KEY``
                environment variable when omitted or empty.

        Raises:
            ValueError: if no API key is available from either source.
        """
        self.api_key = api_key or os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("API key must be provided or set as MISTRAL_API_KEY environment variable")

        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json"
        })

    @staticmethod
    def _should_retry(status_code: int) -> bool:
        """Return True for server errors (5xx) and rate limiting (429)."""
        return status_code >= 500 or status_code == 429

    def upload_file(self, file_path: str) -> str:
        """Upload a file to Mistral API for OCR processing.

        Args:
            file_path: Path of the local file to upload.

        Returns:
            The file ID reported by the API.

        Raises:
            ValueError: if the file exceeds ``MAX_FILE_SIZE``, or if every
                attempt produced an empty response or one without an ID.
            requests.RequestException: for non-retryable HTTP or transport
                errors, or when retryable errors persist through the last
                attempt.
        """
        file_size = os.path.getsize(file_path)
        if file_size > self.MAX_FILE_SIZE:
            raise ValueError(f"File is too large ({file_size/1024/1024:.2f} MB). Maximum allowed size is {self.MAX_FILE_SIZE/1024/1024:.2f} MB")

        max_retries = self.UPLOAD_MAX_RETRIES
        last_error = None

        for attempt in range(1, max_retries + 1):
            # Wait only between attempts, never after the final failure.
            if attempt > 1:
                time.sleep(self.UPLOAD_RETRY_DELAY)

            try:
                # Re-open the file on every attempt so the upload always
                # starts from the first byte.
                with open(file_path, 'rb') as f:
                    response = self.session.post(
                        f"{self.BASE_URL}/files",
                        files={'file': f},
                        data={'purpose': 'ocr'},
                        timeout=self.REQUEST_TIMEOUT,
                    )
                    response.raise_for_status()
            except requests.RequestException as e:
                last_error = e
                failed = getattr(e, 'response', None)
                # Retry on server errors or rate limiting only.
                if failed is not None and self._should_retry(failed.status_code):
                    continue
                raise

            if not response.content:
                last_error = ValueError("Received empty response from API")
                continue

            file_id = response.json().get('id')
            if not file_id:
                last_error = ValueError("Received response without file ID")
                continue

            return file_id

        raise last_error or ValueError(f"Failed to upload file after {max_retries} attempts")

    def get_file_url(self, file_id: str) -> str:
        """Get a signed URL for an uploaded file.

        Args:
            file_id: ID returned by :meth:`upload_file`.

        Returns:
            The ``url`` field of the API response.

        Raises:
            requests.RequestException: on HTTP or transport errors.
            ValueError: if the response carries no URL.
        """
        response = self.session.get(
            f"{self.BASE_URL}/files/{file_id}/url?expiry=24",
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        url = response.json().get('url')
        if not url:
            raise ValueError("API response did not contain a URL")

        return url

    def process_ocr(self, doc_type: str, doc_source: str, include_image_base64: bool = False) -> bytes:
        """Process a document with OCR.

        Args:
            doc_type: Either ``"document_url"`` or ``"image_url"``.
            doc_source: URL of the document or image to process.
            include_image_base64: Sent to the API as ``include_image_base64``.

        Returns:
            The raw response body, already checked to be parseable JSON.

        Raises:
            ValueError: for an unsupported ``doc_type``, a non-retryable
                error status, or when every attempt failed with a retryable
                status, an empty body or invalid JSON.
            requests.RequestException: when the final attempt failed at the
                transport level.
        """
        if doc_type not in ["document_url", "image_url"]:
            raise ValueError(f"Unsupported document type: {doc_type}")

        # The source is sent under a key named after the document type.
        request_body = {
            "model": "mistral-ocr-latest",
            "document": {"type": doc_type, doc_type: doc_source},
            "include_image_base64": include_image_base64
        }

        max_retries = self.OCR_MAX_RETRIES
        retry_delay = self.OCR_RETRY_DELAY
        last_error = None
        delay = 0  # pause owed before the next attempt; none before the first

        for attempt in range(1, max_retries + 1):
            if delay:
                time.sleep(delay)
            delay = retry_delay

            try:
                response = self.session.post(
                    f"{self.BASE_URL}/ocr",
                    json=request_body,
                    headers={"Content-Type": "application/json"},
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as e:
                last_error = e
                continue

            if response.status_code != 200:
                error_msg = response.text or response.reason
                error = ValueError(f"API returned error status: {response.status_code} - {error_msg}")
                # Other client errors will not improve by retrying.
                if not self._should_retry(response.status_code):
                    raise error
                last_error = error
                continue

            if not response.content:
                last_error = ValueError("Received empty response from API")
                # Empty bodies back off progressively with the attempt number.
                delay = retry_delay * attempt
                continue

            try:
                json.loads(response.content)
            except ValueError:
                # Covers JSONDecodeError and undecodable byte sequences.
                last_error = ValueError("Received invalid JSON response from API")
                continue

            return response.content

        raise last_error or ValueError(f"Failed after {max_retries} attempts")
|
||||
@@ -0,0 +1 @@
|
||||
# Commands package initialization
|
||||
@@ -0,0 +1,193 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
def run(args):
    """Entry point of the ``convert`` command.

    Args:
        args: Parsed command-line namespace; ``json_file`` and
            ``output_file`` are read here, and ``single_file`` is forced to
            True when an output file was given.
    """
    wants_named_output = bool(args.output_file)
    if wants_named_output:
        # A named output file only makes sense for one combined document.
        args.single_file = True

    convert_json_to_markdown(args.json_file, args)
|
||||
|
||||
# OCR response structure classes
|
||||
class OCRResponseImage:
    """One image attached to an OCR page: its identifier and base64 payload."""

    def __init__(self, id, image_base64):
        # Both values are stored as given; no validation or decoding happens.
        self.id, self.image_base64 = id, image_base64
|
||||
|
||||
class OCRResponsePage:
    """One page of an OCR result.

    Attributes are stored as passed in, except ``images``, which becomes a
    fresh empty list when the argument is ``None`` or empty.
    """

    def __init__(self, index, markdown, image=None, images=None, dimensions=None):
        self.index, self.markdown = index, markdown
        self.image = image
        self.dimensions = dimensions
        # Never share a default list between pages.
        self.images = images or []
|
||||
|
||||
class OCRResponseMetadata:
    """Document-level metadata of an OCR result; every field is optional."""

    def __init__(self, title=None, author=None, creation_date=None, page_count=None):
        self.title, self.author = title, author
        self.creation_date, self.page_count = creation_date, page_count
|
||||
|
||||
class OCRResponse:
    """Top-level OCR result: a list of pages plus document metadata."""

    def __init__(self, pages=None, metadata=None):
        # Falsy arguments are replaced by fresh defaults: an empty page list
        # and an all-empty metadata object.
        self.metadata = metadata or OCRResponseMetadata()
        self.pages = pages or []
|
||||
|
||||
def replace_image_references(content, images, include_images):
    """Embed base64 image data into the markdown image references.

    A reference has the form ``![<id>](<id>)``. For every image that carries
    base64 data, each matching reference is rewritten to
    ``![<id>](<data URI>)``.

    Args:
        content: Markdown text of a page.
        images: Iterable of objects exposing ``id`` and ``image_base64``.
        include_images: When falsy, ``content`` is returned untouched.

    Returns:
        The markdown text with image references pointing at data URIs.
        References without a matching image, and images without data, are
        left as they are.
    """
    if not include_images or not images:
        return content

    # Map image IDs to data URIs; a later duplicate ID overrides an earlier one.
    data_uri_by_id = {}
    for img in images:
        if not img.image_base64:
            continue
        img_data = img.image_base64
        if not img_data.startswith("data:"):
            # Bare base64 payloads are assumed to be JPEG.
            img_data = "data:image/jpeg;base64," + img_data
        data_uri_by_id[img.id] = img_data

    for image_id, data_uri in data_uri_by_id.items():
        # The reference is a literal string, so plain replacement is enough
        # and avoids any regex or backslash interpretation of the payload.
        reference = f"![{image_id}]({image_id})"
        content = content.replace(reference, f"![{image_id}]({data_uri})")

    return content
|
||||
|
||||
def convert_json_to_markdown(json_file, args):
    """Convert an OCR JSON result file into Markdown.

    The JSON is loaded into the OCR response classes and then written either
    as one combined document (``args.single_file``) or as one ``<index>.md``
    file per page inside ``args.output_dir``.

    Args:
        json_file: Path of the OCR JSON file to read.
        args: Namespace providing ``output_dir``, ``single_file``,
            ``output_file``, ``images``, ``page_breaks`` and
            ``title_from_filename``.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        result = OCRResponse()

        # Rebuild the page objects, including any images listed per page.
        if "pages" in payload:
            for raw_page in payload["pages"]:
                page = OCRResponsePage(
                    index=raw_page.get("index", 0),
                    markdown=raw_page.get("markdown", ""),
                    image=raw_page.get("image", ""),
                )
                if "images" in raw_page:
                    for raw_image in raw_page["images"]:
                        page.images.append(OCRResponseImage(
                            id=raw_image.get("id", ""),
                            image_base64=raw_image.get("image_base64", ""),
                        ))
                result.pages.append(page)

        # Document-level metadata is optional.
        if "metadata" in payload:
            raw_meta = payload["metadata"]
            result.metadata = OCRResponseMetadata(
                title=raw_meta.get("title", ""),
                author=raw_meta.get("author", ""),
                creation_date=raw_meta.get("creation_date", ""),
                page_count=raw_meta.get("page_count", 0),
            )

        os.makedirs(args.output_dir, exist_ok=True)
        out_dir = Path(args.output_dir)
        meta = result.metadata

        if not args.single_file:
            # One file per page, named after the page index.
            for page in result.pages:
                target = out_dir / f"{page.index}.md"

                text = page.markdown
                if args.images:
                    copies = [OCRResponseImage(img.id, img.image_base64) for img in page.images]
                    text = replace_image_references(text, copies, args.images)

                with open(target, 'w', encoding='utf-8') as out:
                    out.write(text)

                print(f"Created markdown file: {target}")
        else:
            # Title: metadata first, then the JSON filename stem, then a
            # generic fallback.
            if meta.title:
                title = meta.title
            elif args.title_from_filename:
                title = Path(json_file).stem
            else:
                title = "Document"

            parts = [f"# {title}\n"]

            # Metadata section, emitted only when at least one field is set.
            if meta.author or meta.creation_date or meta.page_count:
                parts.append("## Document Metadata\n")
                labelled = (
                    ("Author", meta.author),
                    ("Creation Date", meta.creation_date),
                    ("Page Count", meta.page_count),
                )
                parts.extend(f"**{label}:** {value}\n" for label, value in labelled if value)
                parts.append("\n")

            last_position = len(result.pages) - 1
            for position, page in enumerate(result.pages):
                # Page headers are 1-based, page indices 0-based.
                parts.append(f"## Page {page.index + 1}\n")

                text = page.markdown
                if args.images:
                    copies = [OCRResponseImage(img.id, img.image_base64) for img in page.images]
                    text = replace_image_references(text, copies, args.images)

                parts.extend((text, "\n"))

                # Separator between pages, but not after the last one.
                if args.page_breaks and position < last_position:
                    parts.append("\n---\n")

            if args.output_file:
                # The custom name is resolved inside the output directory;
                # any sub-directories it contains are created on demand.
                target = out_dir / args.output_file
                target.parent.mkdir(parents=True, exist_ok=True)
            else:
                target = out_dir / "document.md"

            with open(target, 'w', encoding='utf-8') as out:
                out.write("\n".join(parts))

            print(f"Created single markdown file: {target}")

        print(f"Successfully converted {json_file} to markdown files in {args.output_dir}/")
        print(f"Total pages: {len(result.pages)}")

    except Exception as e:
        print(f"Error converting JSON to markdown: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
@@ -0,0 +1,44 @@
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from mistral_ocr.commands import process, convert
|
||||
|
||||
def run(args):
    """Entry point of the ``markdown`` command: OCR a document, then convert.

    The OCR JSON is written to ``args.json_file`` when given, otherwise to a
    temporary file that is removed afterwards. The JSON is then converted to
    Markdown using the same namespace.

    Args:
        args: Parsed command-line namespace; ``images``, ``output_file``,
            ``json_file`` and ``file_or_url`` are read here, and
            ``single_file`` is forced to True when an output file was given.

    Any exception is reported on stderr and ends the process with status 1.
    """
    # Embedding images in the markdown requires the API to return them.
    include_image_base64 = args.images

    # A named output file only makes sense for one combined document.
    if args.output_file:
        args.single_file = True

    json_output_path = args.json_file
    temp_path = None

    if not json_output_path:
        # Only the name is needed: the file is closed at once and written
        # later by the processing step.
        with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as temp_file:
            temp_path = temp_file.name
        json_output_path = temp_path

    try:
        # Step 1: Process the document
        if args.file_or_url.startswith(("http://", "https://")):
            process.process_url(args.file_or_url, json_output_path, include_image_base64)
        else:
            process.process_local_file(args.file_or_url, json_output_path, include_image_base64)

        # Step 2: Convert the JSON to markdown
        print("Converting JSON to Markdown...")
        convert.convert_json_to_markdown(json_output_path, args)

    except Exception as e:
        print(f"Error processing and converting document: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # Best-effort cleanup of the temporary file. Only filesystem errors
        # are ignored, so interrupts and exits still propagate.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
||||
@@ -0,0 +1,89 @@
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import urllib.parse
|
||||
from mistral_ocr.client import MistralClient
|
||||
|
||||
def run(args):
    """Entry point of the ``process`` command.

    Sources starting with ``http://`` or ``https://`` are processed as URLs;
    everything else is treated as a local file path.

    Args:
        args: Parsed command-line namespace providing ``file``,
            ``output_file`` and ``include_images``.
    """
    source = args.file
    is_remote = source.startswith(("http://", "https://"))
    handler = process_url if is_remote else process_local_file
    handler(source, args.output_file, args.include_images)
|
||||
|
||||
def process_url(url, output_file, include_image_base64):
    """Run OCR on a remote document or image and emit the JSON result.

    Args:
        url: HTTP(S) URL of the document or image.
        output_file: Path for the JSON result, or a falsy value for stdout.
        include_image_base64: Whether the API should return embedded images.

    Any exception is reported on stderr and ends the process with status 1.
    """
    try:
        client = MistralClient()

        # Classify by file extension. The parsed path is checked in addition
        # to the raw URL so that a query string or fragment
        # ("scan.png?token=...") does not hide an image extension.
        image_extensions = (".jpg", ".jpeg", ".png", ".webp", ".gif")
        url_path = urllib.parse.urlparse(url).path.lower()
        is_image = url.lower().endswith(image_extensions) or url_path.endswith(image_extensions)
        doc_type = "image_url" if is_image else "document_url"

        # Process the document
        resp_data = client.process_ocr(doc_type, url, include_image_base64)

        # Handle the output
        handle_output(resp_data, output_file)

    except Exception as e:
        print(f"Error processing document: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def process_local_file(file_path, output_file, include_image_base64):
    """Upload a local file, run OCR on it and emit the JSON result.

    Progress messages go to stderr so that stdout carries nothing but the
    JSON when no output file is given
    (``mistral-ocr process file.pdf > out.json``).

    Args:
        file_path: Path of the local document or image.
        output_file: Path for the JSON result, or a falsy value for stdout.
        include_image_base64: Whether the API should return embedded images.

    A missing file, or any exception, is reported on stderr and ends the
    process with status 1.
    """
    try:
        print(f"Processing local file: {file_path}", file=sys.stderr)

        # Check if file exists
        if not os.path.exists(file_path):
            print(f"Error: file '{file_path}' does not exist", file=sys.stderr)
            sys.exit(1)

        client = MistralClient()

        # Upload the file to Mistral API
        file_id = client.upload_file(file_path)
        print(f"File uploaded successfully with ID: {file_id}", file=sys.stderr)

        # Get the signed file URL for processing
        file_url = client.get_file_url(file_id)

        # Determine the document type based on file extension
        image_extensions = (".jpg", ".jpeg", ".png", ".webp", ".gif")
        is_image = file_path.lower().endswith(image_extensions)
        doc_type = "image_url" if is_image else "document_url"

        print(f"Processing with signed file URL (type: {doc_type})", file=sys.stderr)

        # Process the uploaded file with the appropriate type
        resp_data = client.process_ocr(doc_type, file_url, include_image_base64)

        # Handle the output
        handle_output(resp_data, output_file)

    except Exception as e:
        print(f"Error processing document: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def handle_output(data, output_file):
    """Pretty-print an OCR JSON response to a file or to stdout.

    Args:
        data: JSON document as ``str`` or ``bytes``.
        output_file: Destination path; missing parent directories are
            created. A falsy value sends the JSON to stdout instead.
    """
    formatted = json.dumps(json.loads(data), indent=2)

    if not output_file:
        # No destination given: the JSON itself is the program output.
        print(formatted)
        return

    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(formatted, encoding='utf-8')
    print(f"OCR results saved to {output_file}")
|
||||
@@ -0,0 +1,6 @@
|
||||
import sys
|
||||
|
||||
# Version string shown by the "version" command.
VERSION = "0.1.0"


def run(args):
    """Print the CLI name and version to stdout; ``args`` is not used."""
    banner = "Mistral OCR CLI v" + VERSION
    print(banner)
|
||||
@@ -0,0 +1 @@
|
||||
requests>=2.25.0
|
||||
@@ -0,0 +1,28 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
# Runtime dependencies of the package.
_REQUIREMENTS = ["requests>=2.25.0"]

# Installs the ``mistral-ocr`` executable, which calls
# ``main`` in ``mistral_ocr/__main__.py``.
_ENTRY_POINTS = {
    "console_scripts": ["mistral-ocr=mistral_ocr.__main__:main"],
}

_CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
]

setup(
    name="mistral-ocr",
    version="0.1.0",
    description="A CLI tool for performing OCR on documents using Mistral AI",
    author="Mistral OCR Team",
    packages=find_packages(),
    install_requires=_REQUIREMENTS,
    entry_points=_ENTRY_POINTS,
    classifiers=_CLASSIFIERS,
    python_requires=">=3.7",
)
|
||||
Reference in New Issue
Block a user