Add comprehensive documentation and code comments

This commit adds extensive documentation to the Mistral OCR CLI project:

- Add API.md with detailed API response format documentation
- Add CHANGELOG.md to track version changes
- Add CONTRIBUTING.md with guidelines for contributors
- Enhance README.md with more detailed usage examples and troubleshooting
- Add proper docstrings to all Python modules and functions
- Update requirements.txt with development dependencies
- Improve setup.py with better metadata

These changes make the project more accessible to users and contributors.
This commit is contained in:
2025-04-24 21:11:41 +02:00
parent 240d64023b
commit 5e891ef461
13 changed files with 786 additions and 15 deletions
+17 -1
View File
@@ -1 +1,17 @@
# Mistral OCR Python Package
"""
Mistral OCR Python Package

A command-line tool for processing documents with Mistral AI's OCR capabilities.
This package provides functionality for extracting text and structured content
from PDF documents and images while preserving the original formatting and layout.

Main components:
- Client for interacting with the Mistral AI OCR API
- Commands for processing documents, converting results to Markdown, and more
- Utilities for handling file operations and formatting

For usage information, see the README.md file or run:

    mistral-ocr --help
"""

# Package version string.
__version__ = "0.1.0"
+8
View File
@@ -4,6 +4,14 @@ import os
from mistral_ocr.commands import process, convert, markdown, version
def main():
"""
Main entry point for the Mistral OCR CLI.
Parses command line arguments and dispatches to the appropriate command handler.
Returns:
int: Exit code (0 for success, 1 for error)
"""
parser = argparse.ArgumentParser(
description="A CLI tool for performing OCR on documents using Mistral AI.",
prog="mistral-ocr"
+65 -3
View File
@@ -5,10 +5,33 @@ import requests
from typing import Optional, Dict, Any, Tuple
class MistralClient:
"""
Client for interacting with the Mistral AI OCR API.
This client handles authentication, file uploads, and OCR processing
requests to the Mistral AI API.
Attributes:
BASE_URL (str): Base URL for the Mistral AI API
MAX_FILE_SIZE (int): Maximum allowed file size in bytes (52 MB)
api_key (str): Mistral AI API key for authentication
session (requests.Session): Session object for making HTTP requests
"""
BASE_URL = "https://api.mistral.ai/v1"
MAX_FILE_SIZE = 52 * 1024 * 1024 # 52 MB
def __init__(self, api_key: Optional[str] = None):
"""
Initialize the Mistral AI client.
Args:
api_key (Optional[str]): Mistral AI API key. If not provided,
will look for MISTRAL_API_KEY environment variable.
Raises:
ValueError: If no API key is provided or found in environment variables.
"""
self.api_key = api_key or os.environ.get("MISTRAL_API_KEY")
if not self.api_key:
raise ValueError("API key must be provided or set as MISTRAL_API_KEY environment variable")
@@ -20,7 +43,19 @@ class MistralClient:
})
def upload_file(self, file_path: str) -> str:
"""Upload a file to Mistral API for OCR processing."""
"""
Upload a file to Mistral API for OCR processing.
Args:
file_path (str): Path to the local file to upload
Returns:
str: File ID returned by the API
Raises:
ValueError: If the file is too large or if the upload fails
requests.RequestException: If there's an error communicating with the API
"""
# Check file size
file_size = os.path.getsize(file_path)
if file_size > self.MAX_FILE_SIZE:
@@ -72,7 +107,19 @@ class MistralClient:
raise last_error or ValueError(f"Failed to upload file after {max_retries} attempts")
def get_file_url(self, file_id: str) -> str:
"""Get a signed URL for an uploaded file."""
"""
Get a signed URL for an uploaded file.
Args:
file_id (str): ID of the file previously uploaded to the API
Returns:
str: Signed URL that can be used for OCR processing
Raises:
ValueError: If the API response does not contain a URL
requests.RequestException: If there's an error communicating with the API
"""
response = self.session.get(f"{self.BASE_URL}/files/{file_id}/url?expiry=24")
response.raise_for_status()
@@ -85,7 +132,22 @@ class MistralClient:
return url
def process_ocr(self, doc_type: str, doc_source: str, include_image_base64: bool = False) -> bytes:
"""Process a document with OCR."""
"""
Process a document with OCR.
Args:
doc_type (str): Type of document, either "document_url" or "image_url"
doc_source (str): URL of the document to process
include_image_base64 (bool, optional): Whether to include base64-encoded
images in the response. Defaults to False.
Returns:
bytes: JSON response from the API containing OCR results
Raises:
ValueError: If the document type is unsupported or if processing fails
requests.RequestException: If there's an error communicating with the API
"""
if doc_type not in ["document_url", "image_url"]:
raise ValueError(f"Unsupported document type: {doc_type}")
+62
View File
@@ -5,6 +5,14 @@ import re
from pathlib import Path
def run(args):
"""
Main entry point for the convert command.
Converts OCR JSON results to Markdown format.
Args:
args: Command line arguments parsed by argparse
"""
# If output file is specified, enable single file mode
if args.output_file:
args.single_file = True
@@ -13,11 +21,28 @@ def run(args):
# OCR response structure classes
class OCRResponseImage:
    """
    A single image entry carried by an OCR response.

    Attributes:
        id (str): Identifier of the image
        image_base64 (str): Image data encoded as base64
    """

    def __init__(self, id, image_base64):
        # Plain value holder: both arguments are stored exactly as given.
        self.id, self.image_base64 = id, image_base64
class OCRResponsePage:
"""
Represents a page in the OCR response.
Attributes:
index (int): Zero-based page index
markdown (str): Extracted text content in Markdown format
image (str, optional): Main page image (if available)
images (list): List of OCRResponseImage objects
dimensions (dict, optional): Page dimensions
"""
def __init__(self, index, markdown, image=None, images=None, dimensions=None):
self.index = index
self.markdown = markdown
@@ -26,6 +51,15 @@ class OCRResponsePage:
self.dimensions = dimensions
class OCRResponseMetadata:
"""
Represents metadata in the OCR response.
Attributes:
title (str, optional): Document title
author (str, optional): Document author
creation_date (str, optional): Document creation date
page_count (int, optional): Total number of pages
"""
def __init__(self, title=None, author=None, creation_date=None, page_count=None):
self.title = title
self.author = author
@@ -33,11 +67,29 @@ class OCRResponseMetadata:
self.page_count = page_count
class OCRResponse:
    """
    Top-level container for an OCR result.

    Attributes:
        pages (list): Page entries (expected to be OCRResponsePage objects)
        metadata (OCRResponseMetadata): Metadata describing the document
    """

    def __init__(self, pages=None, metadata=None):
        # Any falsy argument (None, empty list, ...) is swapped for a fresh
        # default so the attributes are always usable.
        if not pages:
            pages = []
        if not metadata:
            metadata = OCRResponseMetadata()
        self.pages = pages
        self.metadata = metadata
def replace_image_references(content, images, include_images):
"""
Replace image references in markdown content with base64 data.
Args:
content (str): Markdown content with image references
images (list): List of OCRResponseImage objects
include_images (bool): Whether to include images in the output
Returns:
str: Markdown content with image references replaced with base64 data
"""
if not include_images or not images:
return content
@@ -62,6 +114,16 @@ def replace_image_references(content, images, include_images):
return content
def convert_json_to_markdown(json_file, args):
"""
Convert OCR JSON results to Markdown format.
Args:
json_file (str): Path to the JSON file containing OCR results
args: Command line arguments containing conversion options
Raises:
SystemExit: If an error occurs during conversion
"""
try:
# Read JSON file
with open(json_file, 'r', encoding='utf-8') as f:
+13
View File
@@ -5,6 +5,19 @@ from pathlib import Path
from mistral_ocr.commands import process, convert
def run(args):
"""
Main entry point for the markdown command.
Processes a document with OCR and converts the results to Markdown in one step.
This is a convenience command that combines the functionality of the 'process'
and 'convert' commands.
Args:
args: Command line arguments parsed by argparse
Raises:
SystemExit: If an error occurs during processing or conversion
"""
# Ensure that if --images is true, include_image_base64 is also true
include_image_base64 = args.images
+37
View File
@@ -6,6 +6,14 @@ import urllib.parse
from mistral_ocr.client import MistralClient
def run(args):
"""
Main entry point for the process command.
Processes a document with OCR, either from a URL or a local file.
Args:
args: Command line arguments parsed by argparse
"""
file_path = args.file
# Determine if input is a URL or a local file
@@ -15,6 +23,17 @@ def run(args):
process_local_file(file_path, args.output_file, args.include_images)
def process_url(url, output_file, include_image_base64):
"""
Process a document from a URL.
Args:
url (str): URL of the document to process
output_file (str): Path to save the OCR results, or None for stdout
include_image_base64 (bool): Whether to include base64-encoded images in the output
Raises:
SystemExit: If an error occurs during processing
"""
try:
client = MistralClient()
@@ -35,6 +54,17 @@ def process_url(url, output_file, include_image_base64):
sys.exit(1)
def process_local_file(file_path, output_file, include_image_base64):
"""
Process a local document file.
Args:
file_path (str): Path to the local file to process
output_file (str): Path to save the OCR results, or None for stdout
include_image_base64 (bool): Whether to include base64-encoded images in the output
Raises:
SystemExit: If an error occurs during processing
"""
try:
print(f"Processing local file: {file_path}")
@@ -71,6 +101,13 @@ def process_local_file(file_path, output_file, include_image_base64):
sys.exit(1)
def handle_output(data, output_file):
"""
Handle the OCR response output.
Args:
data (bytes): JSON response data from the OCR API
output_file (str): Path to save the OCR results, or None for stdout
"""
# Pretty print the JSON response
pretty_json = json.dumps(json.loads(data), indent=2)
+10 -3
View File
@@ -1,6 +1,13 @@
import sys
VERSION = "0.1.0"
from mistral_ocr import __version__
def run(args):
print(f"Mistral OCR CLI v{VERSION}")
"""
Main entry point for the version command.
Prints the current version of the Mistral OCR CLI.
Args:
args: Command line arguments parsed by argparse (not used)
"""
print(f"Mistral OCR CLI v{__version__}")