Files
mistral-ocr/mistral_ocr/client.py
T
2025-04-24 20:54:50 +02:00

154 lines
5.9 KiB
Python

import os
import json
import time
import requests
from typing import Optional, Dict, Any, Tuple
class MistralClient:
    """Small HTTP client for the Mistral file-upload and OCR endpoints.

    All requests go through one ``requests.Session`` that carries the bearer
    token. Transient failures (HTTP 5xx, HTTP 429, empty or malformed bodies)
    are retried a bounded number of times; the retry policy is exposed as
    class attributes so it can be tuned per instance or subclass.
    """

    BASE_URL = "https://api.mistral.ai/v1"
    MAX_FILE_SIZE = 52 * 1024 * 1024  # 52 MB

    # Seconds passed to ``requests`` as ``timeout`` for every call. Without a
    # timeout a stalled connection would block forever and the retry loops
    # below would never get a chance to run.
    DEFAULT_TIMEOUT = 300.0

    # Retry policy: number of attempts and base delay (seconds) per endpoint.
    UPLOAD_MAX_RETRIES = 3
    UPLOAD_RETRY_DELAY = 3
    OCR_MAX_RETRIES = 5
    OCR_RETRY_DELAY = 10

    def __init__(self, api_key: Optional[str] = None, timeout: Optional[float] = DEFAULT_TIMEOUT):
        """Create a client.

        Args:
            api_key: Mistral API key. Falls back to the ``MISTRAL_API_KEY``
                environment variable when omitted or empty.
            timeout: Value forwarded to ``requests`` as ``timeout`` on every
                call. ``None`` disables the timeout.

        Raises:
            ValueError: If no API key is given and the environment variable
                is not set.
        """
        self.api_key = api_key or os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("API key must be provided or set as MISTRAL_API_KEY environment variable")
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        })

    @staticmethod
    def _is_retryable_status(status_code: int) -> bool:
        """Return True for server errors (5xx) and rate limiting (429)."""
        return status_code >= 500 or status_code == 429

    @staticmethod
    def _pause_before_retry(attempt: int, max_retries: int, delay: float) -> None:
        """Sleep for ``delay`` seconds, unless ``attempt`` was the last one."""
        # Sleeping after the final attempt only postpones the error.
        if attempt < max_retries:
            time.sleep(delay)

    @staticmethod
    def _is_valid_json(content: bytes) -> bool:
        """Return True when ``content`` decodes and parses as JSON."""
        try:
            json.loads(content)
        except ValueError:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for bytes that are not valid UTF-8.
            return False
        return True

    @staticmethod
    def _parse_file_id(content: bytes) -> str:
        """Extract the ``id`` field from an upload response body.

        Raises:
            ValueError: If the body is empty, is not JSON, or has no ``id``.
        """
        if not content:
            raise ValueError("Received empty response from API")
        try:
            payload = json.loads(content)
        except ValueError as e:
            raise ValueError("Received invalid JSON response from API") from e
        # Guard against a JSON body that is not an object (list, string, ...).
        file_id = payload.get('id') if isinstance(payload, dict) else None
        if not file_id:
            raise ValueError("Received response without file ID")
        return file_id

    def upload_file(self, file_path: str) -> str:
        """Upload a file to Mistral API for OCR processing.

        Args:
            file_path: Path of the local file to upload.

        Returns:
            The ``id`` reported by the API for the uploaded file.

        Raises:
            ValueError: If the file exceeds ``MAX_FILE_SIZE``, or if every
                attempt produced an empty/malformed response.
            requests.RequestException: On a non-retryable HTTP or transport
                error, or when a retryable error persists on the last attempt.
            OSError: If the file cannot be read.
        """
        file_size = os.path.getsize(file_path)
        if file_size > self.MAX_FILE_SIZE:
            raise ValueError(f"File is too large ({file_size/1024/1024:.2f} MB). Maximum allowed size is {self.MAX_FILE_SIZE/1024/1024:.2f} MB")

        max_retries = self.UPLOAD_MAX_RETRIES
        retry_delay = self.UPLOAD_RETRY_DELAY
        last_error = None

        for attempt in range(1, max_retries + 1):
            try:
                # Re-open on every attempt so a retry streams from byte 0.
                with open(file_path, 'rb') as f:
                    response = self.session.post(
                        f"{self.BASE_URL}/files",
                        files={'file': f},
                        data={'purpose': 'ocr'},
                        timeout=self.timeout,
                    )
                response.raise_for_status()
            except requests.RequestException as e:
                # Only server errors and rate limiting are worth retrying;
                # errors without an HTTP response are re-raised immediately.
                failed = getattr(e, 'response', None)
                if failed is None or not self._is_retryable_status(failed.status_code):
                    raise
                last_error = e
            else:
                try:
                    return self._parse_file_id(response.content)
                except ValueError as e:
                    # A malformed success body is treated as transient.
                    last_error = e
            self._pause_before_retry(attempt, max_retries, retry_delay)

        raise last_error or ValueError(f"Failed to upload file after {max_retries} attempts")

    def get_file_url(self, file_id: str) -> str:
        """Get a signed URL for an uploaded file.

        Args:
            file_id: Identifier returned by ``upload_file``.

        Returns:
            The ``url`` field of the API response.

        Raises:
            ValueError: If the response carries no URL.
            requests.RequestException: On HTTP or transport errors.
        """
        # NOTE(review): expiry=24 is presumably in hours -- confirm against the API docs.
        response = self.session.get(
            f"{self.BASE_URL}/files/{file_id}/url?expiry=24",
            timeout=self.timeout,
        )
        response.raise_for_status()
        url_response = response.json()
        url = url_response.get('url') if isinstance(url_response, dict) else None
        if not url:
            raise ValueError("API response did not contain a URL")
        return url

    def process_ocr(self, doc_type: str, doc_source: str, include_image_base64: bool = False) -> bytes:
        """Process a document with OCR.

        Args:
            doc_type: Either ``"document_url"`` or ``"image_url"``.
            doc_source: The URL sent under the key named by ``doc_type``.
            include_image_base64: Forwarded verbatim in the request body.

        Returns:
            The raw response body, verified to be parseable JSON.

        Raises:
            ValueError: For an unsupported ``doc_type``, a non-retryable HTTP
                status, or when every attempt returned an error status or an
                empty/invalid body.
            requests.RequestException: If the last attempt failed at the
                transport level.
        """
        if doc_type not in ("document_url", "image_url"):
            raise ValueError(f"Unsupported document type: {doc_type}")

        # The key holding the source is named after the document type itself.
        request_body = {
            "model": "mistral-ocr-latest",
            "document": {"type": doc_type, doc_type: doc_source},
            "include_image_base64": include_image_base64,
        }

        max_retries = self.OCR_MAX_RETRIES
        retry_delay = self.OCR_RETRY_DELAY
        last_error = None

        for attempt in range(1, max_retries + 1):
            delay = retry_delay
            try:
                response = self.session.post(
                    f"{self.BASE_URL}/ocr",
                    json=request_body,
                    headers={"Content-Type": "application/json"},
                    timeout=self.timeout,
                )
            except requests.RequestException as e:
                # Transport-level failures are always retried here.
                last_error = e
            else:
                if response.status_code != 200:
                    error_msg = response.text or response.reason
                    error = ValueError(f"API returned error status: {response.status_code} - {error_msg}")
                    if not self._is_retryable_status(response.status_code):
                        raise error
                    last_error = error
                elif not response.content:
                    last_error = ValueError("Received empty response from API")
                    # Empty bodies back off linearly with the attempt number.
                    delay = retry_delay * attempt
                elif not self._is_valid_json(response.content):
                    last_error = ValueError("Received invalid JSON response from API")
                else:
                    return response.content
            self._pause_before_retry(attempt, max_retries, delay)

        raise last_error or ValueError(f"Failed after {max_retries} attempts")