Initial commit
This commit is contained in:
@@ -0,0 +1,89 @@
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import urllib.parse
|
||||
from mistral_ocr.client import MistralClient
|
||||
|
||||
def run(args):
    """Dispatch an OCR request to the URL pipeline or the local-file pipeline.

    Args:
        args: Object exposing ``file`` (a URL or a local path),
            ``output_file`` and ``include_images`` attributes (presumably an
            argparse namespace -- confirm against the CLI entry point).
    """
    source = args.file

    # URI schemes are case-insensitive (RFC 3986, section 3.1), so an input
    # such as "HTTPS://host/doc.pdf" must be routed to the URL pipeline too.
    # Only the prefix is lowercased for the test; the original string is
    # passed on untouched.
    is_remote = source[:8].lower().startswith(("http://", "https://"))

    handler = process_url if is_remote else process_local_file
    handler(source, args.output_file, args.include_images)
|
||||
|
||||
def process_url(url, output_file, include_image_base64):
    """Run OCR on a remote document or image and emit the result.

    The URL is submitted as ``"image_url"`` when it points at a file with a
    known image extension, and as ``"document_url"`` otherwise.

    Args:
        url: Address of the document or image to process.
        output_file: Destination path for the JSON result; passed through to
            ``handle_output`` together with the OCR response.
        include_image_base64: Forwarded unchanged to
            ``MistralClient.process_ocr``.

    Any exception raised while processing is reported on stderr and the
    process exits with status 1.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".webp", ".gif")

    try:
        client = MistralClient()

        # Inspect the path component so that a query string or fragment
        # (e.g. a signed link such as ".../scan.png?sig=abc") does not hide
        # the extension. A URL that cannot be split falls back to the
        # whole-string check below instead of aborting the request.
        try:
            url_path = urllib.parse.urlsplit(url).path.lower()
        except ValueError:
            url_path = ""

        # The whole-URL check is kept as a fallback so that every URL that
        # was classified as an image before still is.
        is_image = (
            url_path.endswith(image_extensions)
            or url.lower().endswith(image_extensions)
        )
        doc_type = "image_url" if is_image else "document_url"

        # Process the document and hand the raw response to the writer.
        resp_data = client.process_ocr(doc_type, url, include_image_base64)
        handle_output(resp_data, output_file)

    except Exception as e:
        print(f"Error processing document: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def process_local_file(file_path, output_file, include_image_base64):
    """Upload a local file, run OCR on it, and emit the result.

    The file is uploaded through ``MistralClient.upload_file``, a URL for the
    upload is obtained with ``MistralClient.get_file_url``, and that URL is
    submitted for OCR as ``"image_url"`` or ``"document_url"`` depending on
    the file extension.

    Args:
        file_path: Path of the file to process.
        output_file: Destination path for the JSON result; passed through to
            ``handle_output`` together with the OCR response.
        include_image_base64: Forwarded unchanged to
            ``MistralClient.process_ocr``.

    Exits with status 1 if the file does not exist or if any exception is
    raised while processing; the reason is printed to stderr.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".webp", ".gif")

    # Without an output file the JSON result itself is printed to stdout, so
    # progress messages must go to stderr or they would corrupt that stream
    # for anything consuming it. With an output file, stdout is kept as the
    # status channel exactly as before.
    status_stream = sys.stdout if output_file else sys.stderr

    try:
        print(f"Processing local file: {file_path}", file=status_stream)

        # Fail early with a clear message instead of an upload error.
        if not os.path.exists(file_path):
            print(f"Error: file '{file_path}' does not exist", file=sys.stderr)
            sys.exit(1)

        client = MistralClient()

        # Upload the file to the Mistral API.
        file_id = client.upload_file(file_path)
        print(f"File uploaded successfully with ID: {file_id}", file=status_stream)

        # Get the signed file URL for processing.
        file_url = client.get_file_url(file_id)

        # Determine the document type from the file extension.
        is_image = file_path.lower().endswith(image_extensions)
        doc_type = "image_url" if is_image else "document_url"

        print(f"Processing with signed file URL (type: {doc_type})", file=status_stream)

        # Process the uploaded file and hand the raw response to the writer.
        resp_data = client.process_ocr(doc_type, file_url, include_image_base64)
        handle_output(resp_data, output_file)

    except Exception as e:
        print(f"Error processing document: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
def handle_output(data, output_file):
    """Pretty-print an OCR response to a file or to stdout.

    Args:
        data: The OCR response, either as JSON text (``str``, ``bytes`` or
            ``bytearray``) or as an already-decoded ``dict`` / ``list``.
        output_file: Path to write the formatted JSON to. Missing parent
            directories are created. A falsy value prints the JSON to stdout
            instead.

    Raises:
        json.JSONDecodeError: If ``data`` is text that is not valid JSON.
        TypeError: If ``data`` is neither JSON text nor a dict/list.
    """
    # Accept an already-decoded payload as well as raw JSON text; anything
    # else still goes through json.loads and fails loudly as before.
    parsed = data if isinstance(data, (dict, list)) else json.loads(data)

    if not output_file:
        # stdout may not be able to encode arbitrary Unicode (legacy
        # consoles), so keep the ASCII-escaped form here.
        print(json.dumps(parsed, indent=2))
        return

    # Create the destination directory if it doesn't exist.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The file is explicitly UTF-8, so write non-ASCII OCR text as real
    # characters rather than unreadable \uXXXX escapes.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(parsed, indent=2, ensure_ascii=False))
    print(f"OCR results saved to {output_file}")
|
||||
Reference in New Issue
Block a user