import os
import json
import time
import requests
from typing import Optional, Dict, Any, Tuple
from urllib.parse import quote


class MistralClient:
    """Small HTTP client for the Mistral file-upload and OCR endpoints.

    All requests go through a shared ``requests.Session`` that carries the
    bearer token, and are sent to paths below ``BASE_URL``.
    """

    BASE_URL = "https://api.mistral.ai/v1"
    MAX_FILE_SIZE = 52 * 1024 * 1024  # 52 MB

    # Class-level fallback so instances created without running __init__
    # (e.g. by a subclass that overrides it) still send requests untimed,
    # exactly as before the ``timeout`` option existed.
    timeout: Optional[float] = None

    def __init__(self, api_key: Optional[str] = None, timeout: Optional[float] = None):
        """Create a client.

        Args:
            api_key: API key. When omitted or empty, the ``MISTRAL_API_KEY``
                environment variable is used instead.
            timeout: Value forwarded as ``timeout=`` to every HTTP call made
                by this client. ``None`` (the default) means no timeout,
                which is the historical behaviour.

        Raises:
            ValueError: If no API key is given and the environment variable
                is unset or empty.
        """
        self.api_key = api_key or os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("API key must be provided or set as MISTRAL_API_KEY environment variable")

        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        })

    @staticmethod
    def _pause_before_retry(attempt: int, max_retries: int, delay: float) -> None:
        """Sleep ``delay`` seconds, unless ``attempt`` was the last attempt."""
        # Sleeping after the final attempt only postpones the error.
        if attempt < max_retries:
            time.sleep(delay)

    def upload_file(self, file_path: str) -> str:
        """Upload a file to Mistral API for OCR processing.

        The upload is attempted up to 3 times. It is retried when the
        response body is empty, when the response carries no ``id``, or when
        the request fails with an HTTP status >= 500 or 429.

        Args:
            file_path: Path of the file to upload.

        Returns:
            The ``id`` field of the JSON response.

        Raises:
            ValueError: If the file exceeds ``MAX_FILE_SIZE``, or if every
                attempt produced an empty response or one without an ID.
            requests.RequestException: On a non-retryable request failure,
                or when a retryable one persists through the last attempt.
            OSError: If the file cannot be stat'ed or opened.
        """
        # Check file size
        file_size = os.path.getsize(file_path)
        if file_size > self.MAX_FILE_SIZE:
            raise ValueError(f"File is too large ({file_size/1024/1024:.2f} MB). Maximum allowed size is {self.MAX_FILE_SIZE/1024/1024:.2f} MB")

        # Retry logic
        max_retries = 3
        retry_delay = 3
        last_error: Optional[Exception] = None

        for attempt in range(1, max_retries + 1):
            try:
                # Re-open on every attempt so the stream starts at offset 0.
                with open(file_path, 'rb') as f:
                    response = self.session.post(
                        f"{self.BASE_URL}/files",
                        files={'file': f},
                        data={'purpose': 'ocr'},
                        timeout=self.timeout,
                    )
                response.raise_for_status()

                if not response.content:
                    last_error = ValueError("Received empty response from API")
                else:
                    file_response = response.json()
                    # A JSON body that is not an object cannot carry an ID.
                    file_id = file_response.get('id') if isinstance(file_response, dict) else None
                    if file_id:
                        return file_id
                    last_error = ValueError("Received response without file ID")
            except requests.RequestException as e:
                last_error = e
                # Retry on server errors or rate limiting only. Compare with
                # None explicitly: a Response is falsy for error statuses.
                failed_response = getattr(e, 'response', None)
                retryable = failed_response is not None and (
                    failed_response.status_code >= 500
                    or failed_response.status_code == 429
                )
                if not retryable:
                    raise

            self._pause_before_retry(attempt, max_retries, retry_delay)

        raise last_error or ValueError(f"Failed to upload file after {max_retries} attempts")

    def get_file_url(self, file_id: str) -> str:
        """Get a signed URL for an uploaded file.

        Args:
            file_id: Identifier of the file, as returned by ``upload_file``.

        Returns:
            The ``url`` field of the JSON response.

        Raises:
            requests.RequestException: If the request fails or returns an
                HTTP error status.
            ValueError: If the response does not contain a URL.
        """
        # Percent-encode the ID so characters such as "/" or "?" cannot
        # change the request path; let requests build the query string.
        response = self.session.get(
            f"{self.BASE_URL}/files/{quote(str(file_id), safe='')}/url",
            params={"expiry": 24},
            timeout=self.timeout,
        )
        response.raise_for_status()

        url_response = response.json()
        url = url_response.get('url') if isinstance(url_response, dict) else None
        if not url:
            raise ValueError("API response did not contain a URL")

        return url

    def process_ocr(self, doc_type: str, doc_source: str, include_image_base64: bool = False) -> bytes:
        """Process a document with OCR.

        The request is attempted up to 5 times. It is retried on any
        ``requests.RequestException``, on HTTP status >= 500 or 429, on an
        empty body, and on a body that is not valid JSON.

        Args:
            doc_type: Either ``"document_url"`` or ``"image_url"``.
            doc_source: Value sent under the key named by ``doc_type``.
            include_image_base64: Forwarded as ``include_image_base64`` in
                the request body.

        Returns:
            The raw response body (bytes), verified to parse as JSON.

        Raises:
            ValueError: If ``doc_type`` is unsupported, if the API answers
                with a non-retryable error status, or if the final attempt
                ended in an error status, empty body or invalid JSON.
            requests.RequestException: If the final attempt failed at the
                request level.
        """
        if doc_type not in ("document_url", "image_url"):
            raise ValueError(f"Unsupported document type: {doc_type}")

        request_body = {
            "model": "mistral-ocr-latest",
            # The source is sent under a key equal to the document type.
            "document": {"type": doc_type, doc_type: doc_source},
            "include_image_base64": include_image_base64,
        }

        # Retry logic
        max_retries = 5
        retry_delay = 10
        last_error: Optional[Exception] = None

        for attempt in range(1, max_retries + 1):
            delay = retry_delay
            try:
                response = self.session.post(
                    f"{self.BASE_URL}/ocr",
                    json=request_body,
                    headers={"Content-Type": "application/json"},
                    timeout=self.timeout,
                )
            except requests.RequestException as e:
                last_error = e
            else:
                status_code = response.status_code
                if status_code != 200:
                    error_msg = response.text or response.reason
                    error = ValueError(f"API returned error status: {status_code} - {error_msg}")
                    # Only server errors and rate limiting are worth retrying.
                    if not (status_code >= 500 or status_code == 429):
                        raise error
                    last_error = error
                elif not response.content:
                    last_error = ValueError("Received empty response from API")
                    # Empty bodies back off linearly with the attempt number.
                    delay = retry_delay * attempt
                else:
                    try:
                        json.loads(response.content)
                    except (json.JSONDecodeError, UnicodeDecodeError):
                        # Bytes that cannot be decoded are invalid JSON too.
                        last_error = ValueError("Received invalid JSON response from API")
                    else:
                        # If we got here, we have a valid response
                        return response.content

            self._pause_before_retry(attempt, max_retries, delay)

        raise last_error or ValueError(f"Failed after {max_retries} attempts")