From 5e891ef461f2dccd4c33e34c8e36fdbc5a2e7443 Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Thu, 24 Apr 2025 21:11:41 +0200 Subject: [PATCH] Add comprehensive documentation and code comments This commit adds extensive documentation to the Mistral OCR CLI project: - Add API.md with detailed API response format documentation - Add CHANGELOG.md to track version changes - Add CONTRIBUTING.md with guidelines for contributors - Enhance README.md with more detailed usage examples and troubleshooting - Add proper docstrings to all Python modules and functions - Update requirements.txt with development dependencies - Improve setup.py with better metadata These changes make the project more accessible to users and contributors. --- API.md | 217 +++++++++++++++++++++++++++++++ CHANGELOG.md | 26 ++++ CONTRIBUTING.md | 171 ++++++++++++++++++++++++ README.md | 142 +++++++++++++++++++- mistral_ocr/__init__.py | 18 ++- mistral_ocr/__main__.py | 8 ++ mistral_ocr/client.py | 68 +++++++++- mistral_ocr/commands/convert.py | 62 +++++++++ mistral_ocr/commands/markdown.py | 13 ++ mistral_ocr/commands/process.py | 37 ++++++ mistral_ocr/commands/version.py | 13 +- requirements.txt | 8 ++ setup.py | 18 ++- 13 files changed, 786 insertions(+), 15 deletions(-) create mode 100644 API.md create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTING.md diff --git a/API.md b/API.md new file mode 100644 index 0000000..6d06b2b --- /dev/null +++ b/API.md @@ -0,0 +1,217 @@ +# Mistral OCR API Documentation + +This document provides detailed information about the Mistral OCR API response format and how to work with it in your applications. 
+ +## Table of Contents + +- [Mistral OCR API Documentation](#mistral-ocr-api-documentation) + - [Table of Contents](#table-of-contents) + - [API Response Format](#api-response-format) + - [Document Metadata](#document-metadata) + - [Pages](#pages) + - [Images](#images) + - [Working with the API Response](#working-with-the-api-response) + - [Parsing the JSON Response](#parsing-the-json-response) + - [Handling Images](#handling-images) + - [Working with Markdown Content](#working-with-markdown-content) + - [Error Handling](#error-handling) + - [API Key Errors](#api-key-errors) + - [File Size Errors](#file-size-errors) + - [Rate Limiting](#rate-limiting) + - [API Limitations](#api-limitations) + +## API Response Format + +The Mistral OCR API returns a JSON response with the following structure: + +```json +{ + "metadata": { + "title": "Document Title", + "author": "Document Author", + "creation_date": "2023-01-01", + "page_count": 5 + }, + "pages": [ + { + "index": 0, + "markdown": "# Page Content\n\nThis is the content of page 1...", + "images": [ + { + "id": "image-1", + "image_base64": "base64-encoded-image-data" + } + ] + }, + { + "index": 1, + "markdown": "## Page 2 Content\n\nThis is the content of page 2...", + "images": [] + } + ] +} +``` + +### Document Metadata + +The `metadata` object contains document-level information: + +| Field | Type | Description | +|-------|------|-------------| +| `title` | String | The document title, if available | +| `author` | String | The document author, if available | +| `creation_date` | String | The document creation date in ISO format (YYYY-MM-DD), if available | +| `page_count` | Integer | The total number of pages in the document | + +Note that some metadata fields may be empty or missing if the information cannot be extracted from the document. 
+ +### Pages + +The `pages` array contains objects representing each page in the document: + +| Field | Type | Description | +|-------|------|-------------| +| `index` | Integer | Zero-based page index | +| `markdown` | String | The extracted text content in Markdown format | +| `images` | Array | An array of image objects found on the page | + +### Images + +Each image object in the `images` array has the following structure: + +| Field | Type | Description | +|-------|------|-------------| +| `id` | String | A unique identifier for the image | +| `image_base64` | String | Base64-encoded image data (only included if `include_images` is specified) | + +## Working with the API Response + +### Parsing the JSON Response + +Here's an example of how to parse the JSON response in Python: + +```python +import json + +# Load the JSON response +with open('ocr_results.json', 'r') as f: + ocr_data = json.load(f) + +# Access metadata +title = ocr_data.get('metadata', {}).get('title', 'Untitled Document') +page_count = ocr_data.get('metadata', {}).get('page_count', 0) + +# Access page content +for page in ocr_data.get('pages', []): + page_index = page.get('index', 0) + page_content = page.get('markdown', '') + + print(f"Page {page_index + 1}:") + print(page_content) + print("-" * 40) +``` + +### Handling Images + +If you've included images in the response (using the `--include-images` flag), you can extract and save them: + +```python +import base64 +import os + +# Create a directory for images +os.makedirs('extracted_images', exist_ok=True) + +# Extract images from each page +for page in ocr_data.get('pages', []): + page_index = page.get('index', 0) + + for img_index, image in enumerate(page.get('images', [])): + img_id = image.get('id', f'unknown-{img_index}') + img_data = image.get('image_base64', '') + + if img_data: + # Remove data URL prefix if present + if ',' in img_data: + img_data = img_data.split(',', 1)[1] + + # Decode and save the image + img_bytes = 
base64.b64decode(img_data) + with open(f'extracted_images/page{page_index}_{img_id}.jpg', 'wb') as img_file: + img_file.write(img_bytes) +``` + +### Working with Markdown Content + +The OCR results are provided in Markdown format, which makes it easy to convert to other formats or display in applications: + +```python +import markdown + +# Convert markdown to HTML +for page in ocr_data.get('pages', []): + page_content = page.get('markdown', '') + html_content = markdown.markdown(page_content) + + # Now you can use the HTML content in your application + # For example, save it to an HTML file + with open(f'page_{page.get("index", 0)}.html', 'w') as f: + f.write(html_content) +``` + +## Error Handling + +When working with the API, you may encounter various errors. Here are some common error scenarios and how to handle them: + +### API Key Errors + +``` +API key must be provided or set as MISTRAL_API_KEY environment variable +``` + +Ensure your API key is correctly set as an environment variable or provided with the `--api-key` flag. + +### File Size Errors + +``` +File is too large (55.00 MB). Maximum allowed size is 52.00 MB +``` + +The Mistral API has a file size limit of 52MB. For larger files, consider splitting them into smaller documents. + +### Rate Limiting + +``` +API returned error status: 429 - Rate limit exceeded +``` + +The API has rate limits. Implement exponential backoff and retry logic in your application: + +```python +import time +import random + +def api_request_with_retry(func, max_retries=5, initial_delay=1): + retries = 0 + while retries < max_retries: + try: + return func() + except Exception as e: + if "429" in str(e) and retries < max_retries - 1: + # Exponential backoff with jitter + delay = initial_delay * (2 ** retries) + random.uniform(0, 1) + print(f"Rate limited. 
Retrying in {delay:.2f} seconds...") + time.sleep(delay) + retries += 1 + else: + raise +``` + +## API Limitations + +- **Maximum file size**: 52MB +- **Supported file formats**: PDF, JPG, JPEG, PNG, WEBP, GIF +- **Rate limits**: Depends on your Mistral AI account tier +- **Concurrent requests**: Depends on your Mistral AI account tier +- **Image extraction**: Some complex images or diagrams may not be perfectly extracted +- **Language support**: Check the Mistral AI documentation for the latest information on supported languages diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ce95f6b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,26 @@ +# Changelog + +All notable changes to the Mistral OCR CLI project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- Enhanced documentation including README.md, CONTRIBUTING.md, and CHANGELOG.md +- More detailed troubleshooting section +- API response format documentation + +## [0.1.0] - 2025-04-24 + +### Added +- Initial release of Mistral OCR CLI +- Process command for OCR processing of PDF documents and images +- Convert command for transforming OCR JSON to Markdown +- Markdown command for one-step processing and conversion +- Support for local files and URLs +- Support for extracting and including images +- Support for metadata extraction +- Support for single-file and multi-file output +- Basic error handling and retries diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..0b5780e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,171 @@ +# Contributing to Mistral OCR CLI + +Thank you for your interest in contributing to Mistral OCR CLI! This document provides guidelines and instructions for contributing to this project. 
+ +## Table of Contents + +- [Contributing to Mistral OCR CLI](#contributing-to-mistral-ocr-cli) + - [Table of Contents](#table-of-contents) + - [Code of Conduct](#code-of-conduct) + - [Getting Started](#getting-started) + - [Development Environment Setup](#development-environment-setup) + - [Project Structure](#project-structure) + - [Development Workflow](#development-workflow) + - [Creating a Feature](#creating-a-feature) + - [Testing](#testing) + - [Documentation](#documentation) + - [Pull Request Process](#pull-request-process) + - [Coding Standards](#coding-standards) + - [Release Process](#release-process) + +## Code of Conduct + +Please be respectful and considerate of others when contributing to this project. We aim to foster an inclusive and welcoming community. + +## Getting Started + +### Development Environment Setup + +1. **Fork and clone the repository**: + ```bash + git clone https://github.com/yourusername/mistral-ocr-python.git + cd mistral-ocr-python + ``` + +2. **Create a virtual environment**: + ```bash + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +3. **Install the package in development mode**: + ```bash + pip install -e . + ``` + +4. **Install development dependencies**: + ```bash + pip install pytest pytest-cov black flake8 + ``` + +### Project Structure + +``` +mistral-ocr-python/ +├── mistral_ocr/ # Main package +│ ├── __init__.py +│ ├── __main__.py # CLI entry point +│ ├── client.py # Mistral API client +│ └── commands/ # Command implementations +│ ├── __init__.py +│ ├── convert.py # Convert command +│ ├── markdown.py # Markdown command +│ ├── process.py # Process command +│ └── version.py # Version command +├── tests/ # Test directory +├── .gitignore +├── README.md +├── CONTRIBUTING.md +├── LICENSE +├── requirements.txt +├── setup.py +└── build.sh +``` + +## Development Workflow + +### Creating a Feature + +1. 
**Create a new branch**: + ```bash + git checkout -b feature/your-feature-name + ``` + +2. **Make your changes**: + - Implement your feature or fix + - Add or update tests as necessary + - Update documentation to reflect your changes + +3. **Commit your changes**: + ```bash + git add . + git commit -m "Add feature: your feature description" + ``` + +### Testing + +We use pytest for testing. To run the tests: + +```bash +python -m pytest +``` + +For coverage report: + +```bash +python -m pytest --cov=mistral_ocr +``` + +Please ensure that your code is well-tested and that all tests pass before submitting a pull request. + +### Documentation + +- Update the README.md if your changes affect the usage of the tool +- Add docstrings to your code following the Google style guide +- Update or add examples if necessary + +## Pull Request Process + +1. **Push your changes to your fork**: + ```bash + git push origin feature/your-feature-name + ``` + +2. **Create a pull request** from your fork to the main repository + +3. **Describe your changes** in the pull request: + - What does this PR add or fix? + - Any breaking changes? + - Any dependencies added? + +4. **Address review comments** if any are provided + +5. **Your PR will be merged** once it's approved + +## Coding Standards + +We follow PEP 8 and use Black for code formatting: + +```bash +black mistral_ocr +``` + +For linting: + +```bash +flake8 mistral_ocr +``` + +General guidelines: + +- Use descriptive variable and function names +- Add type hints to function signatures +- Write docstrings for all functions, classes, and modules +- Keep functions small and focused on a single task +- Use comments to explain complex logic + +## Release Process + +Releases are managed by the project maintainers. If you'd like to propose a release: + +1. Update the version number in: + - `mistral_ocr/__init__.py` (the `__version__` string, which `setup.py` and the `version` command read) + - `setup.py` (only the hard-coded fallback version used when `__version__` cannot be parsed) + +2. Update the CHANGELOG.md with the changes in the new version + +3. 
Create a pull request with these changes + +4. Once merged, the maintainers will create a new release tag + +Thank you for contributing to Mistral OCR CLI! diff --git a/README.md b/README.md index d47a05b..3c26d70 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Mistral OCR CLI (Python) -A command-line tool for processing documents with Mistral AI's OCR capabilities, implemented in Python. +A command-line tool for processing documents with Mistral AI's OCR capabilities, implemented in Python. This tool allows you to extract text and structured content from PDF documents and images while preserving the original formatting and layout. ## Features @@ -10,6 +10,20 @@ A command-line tool for processing documents with Mistral AI's OCR capabilities, - Output results to stdout or to a file - Convert OCR results to Markdown format - Maintain document structure and formatting in the output +- Support for extracting and embedding images +- Metadata extraction (title, author, creation date) +- Page-by-page processing with optional single-file output + +## How It Works + +Mistral OCR CLI works by: + +1. Uploading your document to the Mistral AI API (for local files) or providing the URL +2. Processing the document using Mistral's advanced OCR capabilities +3. Receiving structured JSON data containing the extracted text, formatting, and metadata +4. Optionally converting this data to Markdown format for easy reading and editing + +The tool handles authentication, file uploads, API communication, and result formatting, making it easy to integrate OCR capabilities into your workflow. ## Installation @@ -17,6 +31,7 @@ A command-line tool for processing documents with Mistral AI's OCR capabilities, - Python 3.7 or later - pip (Python package installer) +- A Mistral AI API key (sign up at [Mistral AI](https://mistral.ai) if you don't have one) ### Installing from source @@ -26,7 +41,7 @@ cd mistral-ocr-python pip install -e . 
``` -Alternatively, you can use the build script: +Alternatively, you can use the build script which creates a virtual environment and installs the package: ```bash git clone https://github.com/yourusername/mistral-ocr-python @@ -34,13 +49,19 @@ cd mistral-ocr-python ./build.sh ``` +### Installing from PyPI (coming soon) + +```bash +pip install mistral-ocr +``` + ## Usage ### Setting up your API key You can provide your Mistral API key in two ways: -1. Environment variable: +1. Environment variable (recommended for security): ```bash export MISTRAL_API_KEY=your-api-key ``` @@ -125,19 +146,19 @@ mistral-ocr version ### Examples -### Process a local PDF and save the output +#### Process a local PDF and save the output ```bash mistral-ocr process ~/Documents/sample.pdf --output-file results.json ``` -### Process a document from a URL +#### Process a document from a URL ```bash mistral-ocr process https://arxiv.org/pdf/2201.04234 > output.json ``` -### Convert OCR JSON to Markdown files +#### Convert OCR JSON to Markdown files ```bash # Create separate files (one per page) @@ -150,7 +171,7 @@ mistral-ocr convert output.json --single-file --output-dir markdown_docs mistral-ocr convert output.json --output-file docs/paper.md ``` -### Process a document and generate markdown files in one step +#### Process a document and generate markdown files in one step ```bash # Generate separate files (one per page) @@ -163,6 +184,113 @@ mistral-ocr markdown ~/Documents/research-paper.pdf --single-file --output-dir r mistral-ocr markdown ~/Documents/research-paper.pdf --output-file research_docs/paper.md ``` +## OCR Response Format + +The OCR API returns a JSON response with the following structure: + +```json +{ + "metadata": { + "title": "Document Title", + "author": "Document Author", + "creation_date": "2023-01-01", + "page_count": 5 + }, + "pages": [ + { + "index": 0, + "markdown": "# Page Content\n\nThis is the content of page 1...", + "images": [ + { + "id": "image-1", + 
"image_base64": "base64-encoded-image-data" + } + ] + }, + { + "index": 1, + "markdown": "## Page 2 Content\n\nThis is the content of page 2...", + "images": [] + } + ] +} +``` + +### Key Components: + +- **metadata**: Contains document-level information + - **title**: Document title (if available) + - **author**: Document author (if available) + - **creation_date**: Document creation date (if available) + - **page_count**: Total number of pages + +- **pages**: Array of page objects + - **index**: Zero-based page index + - **markdown**: Extracted text in Markdown format + - **images**: Array of images found on the page + - **id**: Unique image identifier + - **image_base64**: Base64-encoded image data (only included if `--include-images` is specified) + +## Troubleshooting + +### Common Issues + +#### API Key Issues + +``` +Error processing document: API key must be provided or set as MISTRAL_API_KEY environment variable +``` + +**Solution**: Ensure your API key is correctly set as an environment variable or provided with the `--api-key` flag. + +#### File Size Limits + +``` +Error processing document: File is too large (55.00 MB). Maximum allowed size is 52.00 MB +``` + +**Solution**: The Mistral API has a file size limit of 52MB. For larger files, consider splitting them into smaller documents. + +#### Rate Limiting + +``` +Error processing document: API returned error status: 429 - Rate limit exceeded +``` + +**Solution**: The API has rate limits. Wait a few minutes before trying again or contact Mistral AI to increase your rate limits. + +#### Invalid JSON + +``` +Error converting JSON to markdown: Expecting property name enclosed in double quotes +``` + +**Solution**: Ensure the JSON file is valid. You can validate it using tools like `jq`. 
+ +### API Limitations + +- Maximum file size: 52MB +- Supported file formats: PDF, JPG, JPEG, PNG, WEBP, GIF +- Rate limits may apply depending on your Mistral AI account tier + +## Contributing + +Contributions to Mistral OCR CLI are welcome! Here's how you can contribute: + +1. **Fork the repository** +2. **Create a feature branch**: + ```bash + git checkout -b feature/your-feature-name + ``` +3. **Make your changes** +4. **Run tests** (if available): + ```bash + python -m pytest + ``` +5. **Submit a pull request** + +Please ensure your code follows the project's coding standards and includes appropriate tests and documentation. + ## License MIT diff --git a/mistral_ocr/__init__.py b/mistral_ocr/__init__.py index 1abd1d2..cd79cf7 100644 --- a/mistral_ocr/__init__.py +++ b/mistral_ocr/__init__.py @@ -1 +1,17 @@ -# Mistral OCR Python Package +""" +Mistral OCR Python Package + +A command-line tool for processing documents with Mistral AI's OCR capabilities. +This package provides functionality for extracting text and structured content +from PDF documents and images while preserving the original formatting and layout. + +Main components: +- Client for interacting with the Mistral AI OCR API +- Commands for processing documents, converting results to Markdown, and more +- Utilities for handling file operations and formatting + +For usage information, see the README.md file or run: + mistral-ocr --help +""" + +__version__ = "0.1.0" diff --git a/mistral_ocr/__main__.py b/mistral_ocr/__main__.py index e7b0232..fac57a1 100644 --- a/mistral_ocr/__main__.py +++ b/mistral_ocr/__main__.py @@ -4,6 +4,14 @@ import os from mistral_ocr.commands import process, convert, markdown, version def main(): + """ + Main entry point for the Mistral OCR CLI. + + Parses command line arguments and dispatches to the appropriate command handler. 
+ + Returns: + int: Exit code (0 for success, 1 for error) + """ parser = argparse.ArgumentParser( description="A CLI tool for performing OCR on documents using Mistral AI.", prog="mistral-ocr" diff --git a/mistral_ocr/client.py b/mistral_ocr/client.py index 6191dc1..8a562d1 100644 --- a/mistral_ocr/client.py +++ b/mistral_ocr/client.py @@ -5,10 +5,33 @@ import requests from typing import Optional, Dict, Any, Tuple class MistralClient: + """ + Client for interacting with the Mistral AI OCR API. + + This client handles authentication, file uploads, and OCR processing + requests to the Mistral AI API. + + Attributes: + BASE_URL (str): Base URL for the Mistral AI API + MAX_FILE_SIZE (int): Maximum allowed file size in bytes (52 MB) + api_key (str): Mistral AI API key for authentication + session (requests.Session): Session object for making HTTP requests + """ + BASE_URL = "https://api.mistral.ai/v1" MAX_FILE_SIZE = 52 * 1024 * 1024 # 52 MB def __init__(self, api_key: Optional[str] = None): + """ + Initialize the Mistral AI client. + + Args: + api_key (Optional[str]): Mistral AI API key. If not provided, + will look for MISTRAL_API_KEY environment variable. + + Raises: + ValueError: If no API key is provided or found in environment variables. + """ self.api_key = api_key or os.environ.get("MISTRAL_API_KEY") if not self.api_key: raise ValueError("API key must be provided or set as MISTRAL_API_KEY environment variable") @@ -20,7 +43,19 @@ class MistralClient: }) def upload_file(self, file_path: str) -> str: - """Upload a file to Mistral API for OCR processing.""" + """ + Upload a file to Mistral API for OCR processing. 
+ + Args: + file_path (str): Path to the local file to upload + + Returns: + str: File ID returned by the API + + Raises: + ValueError: If the file is too large or if the upload fails + requests.RequestException: If there's an error communicating with the API + """ # Check file size file_size = os.path.getsize(file_path) if file_size > self.MAX_FILE_SIZE: @@ -72,7 +107,19 @@ class MistralClient: raise last_error or ValueError(f"Failed to upload file after {max_retries} attempts") def get_file_url(self, file_id: str) -> str: - """Get a signed URL for an uploaded file.""" + """ + Get a signed URL for an uploaded file. + + Args: + file_id (str): ID of the file previously uploaded to the API + + Returns: + str: Signed URL that can be used for OCR processing + + Raises: + ValueError: If the API response does not contain a URL + requests.RequestException: If there's an error communicating with the API + """ response = self.session.get(f"{self.BASE_URL}/files/{file_id}/url?expiry=24") response.raise_for_status() @@ -85,7 +132,22 @@ class MistralClient: return url def process_ocr(self, doc_type: str, doc_source: str, include_image_base64: bool = False) -> bytes: - """Process a document with OCR.""" + """ + Process a document with OCR. + + Args: + doc_type (str): Type of document, either "document_url" or "image_url" + doc_source (str): URL of the document to process + include_image_base64 (bool, optional): Whether to include base64-encoded + images in the response. Defaults to False. 
+ + Returns: + bytes: JSON response from the API containing OCR results + + Raises: + ValueError: If the document type is unsupported or if processing fails + requests.RequestException: If there's an error communicating with the API + """ if doc_type not in ["document_url", "image_url"]: raise ValueError(f"Unsupported document type: {doc_type}") diff --git a/mistral_ocr/commands/convert.py b/mistral_ocr/commands/convert.py index 8f57b1f..6c4c13f 100644 --- a/mistral_ocr/commands/convert.py +++ b/mistral_ocr/commands/convert.py @@ -5,6 +5,14 @@ import re from pathlib import Path def run(args): + """ + Main entry point for the convert command. + + Converts OCR JSON results to Markdown format. + + Args: + args: Command line arguments parsed by argparse + """ # If output file is specified, enable single file mode if args.output_file: args.single_file = True @@ -13,11 +21,28 @@ def run(args): # OCR response structure classes class OCRResponseImage: + """ + Represents an image in the OCR response. + + Attributes: + id (str): Unique identifier for the image + image_base64 (str): Base64-encoded image data + """ def __init__(self, id, image_base64): self.id = id self.image_base64 = image_base64 class OCRResponsePage: + """ + Represents a page in the OCR response. + + Attributes: + index (int): Zero-based page index + markdown (str): Extracted text content in Markdown format + image (str, optional): Main page image (if available) + images (list): List of OCRResponseImage objects + dimensions (dict, optional): Page dimensions + """ def __init__(self, index, markdown, image=None, images=None, dimensions=None): self.index = index self.markdown = markdown @@ -26,6 +51,15 @@ class OCRResponsePage: self.dimensions = dimensions class OCRResponseMetadata: + """ + Represents metadata in the OCR response. 
+ + Attributes: + title (str, optional): Document title + author (str, optional): Document author + creation_date (str, optional): Document creation date + page_count (int, optional): Total number of pages + """ def __init__(self, title=None, author=None, creation_date=None, page_count=None): self.title = title self.author = author @@ -33,11 +67,29 @@ class OCRResponseMetadata: self.page_count = page_count class OCRResponse: + """ + Represents the complete OCR response. + + Attributes: + pages (list): List of OCRResponsePage objects + metadata (OCRResponseMetadata): Document metadata + """ def __init__(self, pages=None, metadata=None): self.pages = pages or [] self.metadata = metadata or OCRResponseMetadata() def replace_image_references(content, images, include_images): + """ + Replace image references in markdown content with base64 data. + + Args: + content (str): Markdown content with image references + images (list): List of OCRResponseImage objects + include_images (bool): Whether to include images in the output + + Returns: + str: Markdown content with image references replaced with base64 data + """ if not include_images or not images: return content @@ -62,6 +114,16 @@ def replace_image_references(content, images, include_images): return content def convert_json_to_markdown(json_file, args): + """ + Convert OCR JSON results to Markdown format. + + Args: + json_file (str): Path to the JSON file containing OCR results + args: Command line arguments containing conversion options + + Raises: + SystemExit: If an error occurs during conversion + """ try: # Read JSON file with open(json_file, 'r', encoding='utf-8') as f: diff --git a/mistral_ocr/commands/markdown.py b/mistral_ocr/commands/markdown.py index 57d89d6..c67bc71 100644 --- a/mistral_ocr/commands/markdown.py +++ b/mistral_ocr/commands/markdown.py @@ -5,6 +5,19 @@ from pathlib import Path from mistral_ocr.commands import process, convert def run(args): + """ + Main entry point for the markdown command. 
+ + Processes a document with OCR and converts the results to Markdown in one step. + This is a convenience command that combines the functionality of the 'process' + and 'convert' commands. + + Args: + args: Command line arguments parsed by argparse + + Raises: + SystemExit: If an error occurs during processing or conversion + """ # Ensure that if --images is true, include_image_base64 is also true include_image_base64 = args.images diff --git a/mistral_ocr/commands/process.py b/mistral_ocr/commands/process.py index 393ab00..8977dbb 100644 --- a/mistral_ocr/commands/process.py +++ b/mistral_ocr/commands/process.py @@ -6,6 +6,14 @@ import urllib.parse from mistral_ocr.client import MistralClient def run(args): + """ + Main entry point for the process command. + + Processes a document with OCR, either from a URL or a local file. + + Args: + args: Command line arguments parsed by argparse + """ file_path = args.file # Determine if input is a URL or a local file @@ -15,6 +23,17 @@ def run(args): process_local_file(file_path, args.output_file, args.include_images) def process_url(url, output_file, include_image_base64): + """ + Process a document from a URL. + + Args: + url (str): URL of the document to process + output_file (str): Path to save the OCR results, or None for stdout + include_image_base64 (bool): Whether to include base64-encoded images in the output + + Raises: + SystemExit: If an error occurs during processing + """ try: client = MistralClient() @@ -35,6 +54,17 @@ def process_url(url, output_file, include_image_base64): sys.exit(1) def process_local_file(file_path, output_file, include_image_base64): + """ + Process a local document file. 
+ + Args: + file_path (str): Path to the local file to process + output_file (str): Path to save the OCR results, or None for stdout + include_image_base64 (bool): Whether to include base64-encoded images in the output + + Raises: + SystemExit: If an error occurs during processing + """ try: print(f"Processing local file: {file_path}") @@ -71,6 +101,13 @@ def process_local_file(file_path, output_file, include_image_base64): sys.exit(1) def handle_output(data, output_file): + """ + Handle the OCR response output. + + Args: + data (bytes): JSON response data from the OCR API + output_file (str): Path to save the OCR results, or None for stdout + """ # Pretty print the JSON response pretty_json = json.dumps(json.loads(data), indent=2) diff --git a/mistral_ocr/commands/version.py b/mistral_ocr/commands/version.py index c659810..03d897c 100644 --- a/mistral_ocr/commands/version.py +++ b/mistral_ocr/commands/version.py @@ -1,6 +1,13 @@ import sys - -VERSION = "0.1.0" +from mistral_ocr import __version__ def run(args): - print(f"Mistral OCR CLI v{VERSION}") + """ + Main entry point for the version command. + + Prints the current version of the Mistral OCR CLI. 
+ + Args: + args: Command line arguments parsed by argparse (not used) + """ + print(f"Mistral OCR CLI v{__version__}") diff --git a/requirements.txt b/requirements.txt index 4a5625c..024ae26 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,9 @@ +# Core dependencies requests>=2.25.0 + +# Optional and development dependencies +markdown>=3.3.0 # For converting markdown to HTML +pytest>=6.0.0 # For running tests +pytest-cov>=2.0.0 # For test coverage reports +black>=21.0.0 # For code formatting +flake8>=3.9.0 # For linting diff --git a/setup.py b/setup.py index c68a0fe..f9ebdad 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,24 @@ from setuptools import setup, find_packages +import os +import re + +# Read version from __init__.py +with open(os.path.join('mistral_ocr', '__init__.py'), 'r') as f: + version_match = re.search(r"__version__\s*=\s*['\"]([^'\"]*)['\"]", f.read()) + version = version_match.group(1) if version_match else '0.1.0' + +# Read long description from README.md +with open('README.md', 'r', encoding='utf-8') as f: + long_description = f.read() setup( name="mistral-ocr", - version="0.1.0", + version=version, description="A CLI tool for performing OCR on documents using Mistral AI", + long_description=long_description, + long_description_content_type="text/markdown", author="Mistral OCR Team", + url="https://github.com/yourusername/mistral-ocr-python", packages=find_packages(), install_requires=[ "requests>=2.25.0", @@ -23,6 +37,8 @@ setup( "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Topic :: Text Processing", + "Topic :: Utilities", ], python_requires=">=3.7", )