commit c47d3205a0a801e4f9753b1a6c34a50f11353f4d
Author: Heiko Joerg Schick
Date:   Wed May 21 21:03:52 2025 +0200

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..326400c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,65 @@
+# mknotes
+
+A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
+
+## Features
+
+- Batch transcribes all `.mp3` and `.m4a` files in a specified directory
+- Saves transcriptions as `.txt` files
+- Enhances notes using GPT-4.1 with a custom prompt
+- Outputs enhanced notes in markdown format
+- Configurable input and output directories
+
+## Installation
+
+```bash
+# Clone the repository
+git clone https://github.com/yourusername/mknotes.git
+cd mknotes
+
+# Create a virtual environment
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+## Usage
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo]
+```
+
+- `--input-dir`: Directory containing audio files (required)
+- `--output-dir`: Directory for output files (default: "output")
+- `--model-size`: Faster Whisper model size: `tiny`, `base`, `small`, `medium` or `large` (default: "medium")
+- `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
+- `--force`: Force re-processing of files even if output files already exist
+
+### Turbo Mode Hardware Requirements
+
+The `--turbo` flag enables faster inference using the `int8_float16` compute type, which can significantly speed up transcription. However, this requires:
+
+- CUDA-compatible GPU with Tensor Cores (NVIDIA Ampere, Turing, or newer architecture)
+- Or CPU with AVX2 support
+
+If your hardware does not support this optimization, the program will automatically fall back to the next most compatible compute type and print a warning.
+
+#### Compute Type Fallback
+
+The program will attempt to use the most efficient compute type supported by your hardware, in the following order:
+
+- `int8_float16` (if `--turbo` is enabled)
+- `float16`
+- `int8`
+- `float32` (most compatible, works on virtually all hardware)
+
+If a compute type is not supported, the program will try the next one in the list until successful.
+
+## Requirements
+
+- Python 3.8+
+- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
+- [OpenAI Python SDK](https://github.com/openai/openai-python)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c43b4db
--- /dev/null
+++ b/main.py
@@ -0,0 +1,75 @@
+# Entry point for the mknotes audio transcription and note enhancement tool
+
+import os
+from tqdm import tqdm
+
+from src.cli import parse_args
+from src.utils import find_audio_files, ensure_directory_exists
+from src.transcriber import transcribe_audio
+from src.enhancer import enhance_note
+
+def main():
+    """
+    Transcribe every audio file found under the input directory and write an
+    enhanced markdown note for each one into the output directory.
+
+    A failure while transcribing or enhancing one file is reported and the
+    batch continues with the next file.
+    """
+    args = parse_args()
+    input_dir = args.input_dir
+    output_dir = args.output_dir
+    model_size = args.model_size
+
+    ensure_directory_exists(output_dir)
+
+    audio_files = find_audio_files(input_dir)
+    if not audio_files:
+        print(f"No audio files found in {input_dir}.")
+        return
+
+    print(f"Found {len(audio_files)} audio files. Starting transcription and enhancement...")
+
+    for audio_path in tqdm(audio_files, desc="Processing files"):
+        # NOTE: outputs are keyed by base name only, so same-named audio files
+        # from different subdirectories map to the same output paths.
+        base_name = os.path.splitext(os.path.basename(audio_path))[0]
+        txt_path = os.path.join(output_dir, base_name + ".txt")
+        md_path = os.path.join(output_dir, base_name + ".md")
+
+        # Skip if enhanced note already exists (unless force flag is set)
+        if os.path.exists(md_path) and not args.force:
+            print(f"Skipping {audio_path} - enhanced note already exists at {md_path}")
+            continue
+
+        try:
+            # Check if transcription exists
+            if os.path.exists(txt_path) and not args.force:
+                print(f"Using existing transcription for {audio_path}")
+                with open(txt_path, "r", encoding="utf-8") as f:
+                    transcription = f.read()
+            else:
+                # Transcribe audio
+                transcription = transcribe_audio(
+                    audio_path,
+                    txt_path,
+                    model_size=model_size,
+                    turbo=args.turbo
+                )
+        except Exception as e:
+            # One unreadable or corrupt file must not abort the whole batch
+            print(f"Error transcribing {audio_path}: {e}")
+            continue
+
+        # The skip check above guarantees the note is missing or being forced
+        try:
+            enhanced_note = enhance_note(transcription)
+            with open(md_path, "w", encoding="utf-8") as f:
+                f.write(enhanced_note)
+        except Exception as e:
+            print(f"Error enhancing note for {audio_path}: {e}")
+
+    print(f"Processing complete. Output saved to: {output_dir}")
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8852add
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+faster-whisper
+openai
+tqdm
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..042ed8a
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+# mknotes.src package
diff --git a/src/cli.py b/src/cli.py
new file mode 100644
index 0000000..10e3d19
--- /dev/null
+++ b/src/cli.py
@@ -0,0 +1,38 @@
+# Command-line interface for mknotes
+
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Transcribe audio files and enhance notes using GPT-4.1"
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=str,
+        required=True,
+        help="Directory containing audio files (.mp3, .m4a)"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="output",
+        help="Directory to save output files (default: output)"
+    )
+    parser.add_argument(
+        "--model-size",
+        type=str,
+        default="medium",
+        choices=["tiny", "base", "small", "medium", "large"],
+        help="Faster Whisper model size (default: medium)"
+    )
+    parser.add_argument(
+        "--turbo",
+        action="store_true",
+        help="Enable turbo mode for faster inference (uses int8_float16 compute type)"
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Force re-processing of files even if output files already exist"
+    )
+    return parser.parse_args()
diff --git a/src/enhancer.py b/src/enhancer.py
new file mode 100644
index 0000000..ff85881
--- /dev/null
+++ b/src/enhancer.py
@@ -0,0 +1,103 @@
+# Note enhancement logic using OpenAI GPT-4.1
+
+import openai
+import os
+
+SYSTEM_PROMPT = """You are Edison, an expert executive assistant to the CTO of an IT technology firm with over 22 years of experience in technology. Your task is to provide a deep-dive consultation tailored to the client's issue. Ensure your responses make the user feel understood, guided, and satisfied. The name of the CTO is Heiko.
+
+The consultation is deemed successful when the user explicitly communicates their satisfaction with the solution.
+
+**Instructions:**
+- Write clearly and straight to the point.
+- Use professional business English.
+- Use always British English, not American English.
+- Format titles, main sections and subsections:
+  - Capitalise only the first word of each title, section, and subsection.
+  - Keep all subsequent words in lowercase except for acronyms, abbreviations and proper nouns, which should remain in their proper uppercase form.
+- Do not use emojis.
+- Format dates appropriately based on context:
+  - Use the ISO format (`yyyy-MM-dd`) for technical content, such as code, specifications, tables, deadlines, or numbered/bulleted lists (e.g., 2024-02-12).
+  - Use the British standard date format `d MMMM yyyy` in general, conversational, or non-technical text (e.g., 12 February 2024). Use the current year (2025) if no year is provided.
+- Use the 24-hour time format (HH:mm) consistently throughout.
+- Introduce abbreviations with the full term followed by the abbreviation in parentheses on their first mention, only when the context is provided.
+  - Do not introduce abbreviations for AI, CPU, and HPC.
+- Use only the metric system and automatically convert imperial measurements (like Fahrenheit, inches, or feet) to metric units.
+- Ensure that all phone numbers are formatted in the international format starting with a '+' followed by the country code, area code, and local number (e.g., +49-111-22223333).
+
+**Guidelines for British English:**
+British English is the form of English used in the United Kingdom, characterised by distinct spelling, vocabulary, grammar, and punctuation.
+1. **Spelling Differences:**
+   - Use "-our" instead of "-or" (e.g., "colour" not "color", "honour" not "honor").
+   - Use "-re" instead of "-er" (e.g., "centre" not "center", "metre" not "meter").
+   - Prefer "-ise" over "-ize" (e.g., "realise" instead of "realize").
+2. **Grammar Differences:**
+   - Use the present perfect tense with "just," "yet," and "already" (e.g., "I have just eaten").
+   - Treat collective nouns as singular or plural depending on context (e.g., "The team is winning" or "The team are playing well").
+3. **Punctuation Usage:**
+   - Use single quotation marks for initial quotes and double quotation marks for quotes within quotes (e.g., 'He said, "Hello."').
+   - Place commas and periods outside quotation marks when they are not part of the quoted material (e.g., 'He said "hello", and then left.').
+"""
+
+PROMPT = """Enhance existing notes using additional context provided from audio transcription or uploaded file content. Your task is to make the notes more useful and comprehensive by incorporating relevant information from the provided context.
+
+Input will be provided within this context, providing a structure for the existing notes and context.
+
+# Output Format
+
+Provide the enhanced notes in markdown format. Use markdown syntax for headings, lists, and emphasis to improve clarity and presentation. Ensure that all integrated content from the context is accurately reflected. Return only the markdown-formatted note. Do not wrap the output in code blocks or use triple backticks."""
+
+def get_model_max_tokens(client, model_name):
+    """
+    Retrieve the maximum token limit for a given OpenAI model dynamically.
+    Falls back to 4096 if the API call fails or the information is unavailable.
+    """
+    try:
+        model_info = client.models.retrieve(model_name)
+        # OpenAI's API may provide context_window or similar attribute
+        max_tokens = getattr(model_info, "context_window", None)
+        if max_tokens is None and hasattr(model_info, "max_tokens"):
+            max_tokens = model_info.max_tokens
+        if max_tokens is None:
+            # Fallback for known models
+            if "gpt-4" in model_name:
+                max_tokens = 4096
+            else:
+                max_tokens = 2048
+    except Exception:
+        max_tokens = 4096
+    return max_tokens
+
+def enhance_note(transcription_text, api_key=None):
+    """
+    Enhance the transcription using OpenAI GPT-4.1 and return markdown-formatted notes.
+    Dynamically retrieves the model's max token limit from the OpenAI API.
+
+    Raises:
+        ValueError: If no API key is passed or set in OPENAI_API_KEY.
+        RuntimeError: If the response carries no message content.
+    """
+    if api_key is None:
+        api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OpenAI API key not provided or set in environment variable OPENAI_API_KEY.")
+
+    client = openai.OpenAI(api_key=api_key)
+    model_name = "gpt-4.1"
+    max_tokens = int(get_model_max_tokens(client, model_name) * 0.9)  # Use 90% of max to allow for prompt/context
+
+    response = client.chat.completions.create(
+        model=model_name,
+        messages=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": PROMPT},
+            {"role": "user", "content": transcription_text}
+        ],
+        max_tokens=max_tokens,
+        temperature=0.3
+    )
+    content = response.choices[0].message.content
+    if content is None:
+        # The SDK types message content as optional; fail with a clear error
+        # instead of an AttributeError on None.
+        raise RuntimeError("OpenAI response contained no message content.")
+    return content.strip()
diff --git a/src/transcriber.py b/src/transcriber.py
new file mode 100644
index 0000000..c493cfd
--- /dev/null
+++ b/src/transcriber.py
@@ -0,0 +1,63 @@
+# Audio transcription logic using Faster Whisper
+
+import functools
+
+from faster_whisper import WhisperModel
+
+@functools.lru_cache(maxsize=1)
+def _load_model(model_size, turbo):
+    """
+    Initialise a WhisperModel, falling back through compute types until one works.
+
+    The result is cached so that a batch run loads the model once instead of
+    once per audio file. Only the most recent (model_size, turbo) combination
+    is kept, so at most one model stays cached.
+
+    Raises:
+        ValueError: If the last compute type is rejected, or model creation
+            fails with a ValueError unrelated to the compute type.
+    """
+    # Try compute types in order of preference
+    if turbo:
+        compute_types = ["int8_float16", "float16", "int8", "float32"]
+    else:
+        compute_types = ["float16", "int8", "float32"]
+
+    for compute_type in compute_types:
+        try:
+            print(f"Attempting to initialize model with compute_type: {compute_type}")
+            model = WhisperModel(model_size, compute_type=compute_type)
+        except ValueError as e:
+            if "compute type" in str(e) and compute_type != compute_types[-1]:
+                print(f"Warning: {compute_type} compute type not supported by your hardware.")
+                print("Trying next compute type...")
+            else:
+                raise
+        else:
+            print(f"Successfully initialized model with compute_type: {compute_type}")
+            return model
+
+    raise RuntimeError("Failed to initialize Whisper model with any compute type")
+
+def transcribe_audio(input_path, output_path, model_size="medium", turbo=False):
+    """
+    Transcribe an audio file using Faster Whisper and save the result as a .txt file.
+
+    Args:
+        input_path: Path to the audio file
+        output_path: Path to save the transcription
+        model_size: Size of the Whisper model to use
+        turbo: Whether to use int8_float16 compute type for faster inference
+
+    Returns:
+        The stripped transcription text, one segment per line.
+    """
+    # bool() normalises truthy values so the cache key stays stable
+    model = _load_model(model_size, bool(turbo))
+
+    segments, _ = model.transcribe(input_path)
+    # Join once instead of repeated string concatenation
+    transcription = "\n".join(segment.text.strip() for segment in segments).strip()
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(transcription)
+    return transcription
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..9802897
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,21 @@
+# Helper functions for mknotes
+
+import os
+
+def find_audio_files(directory, extensions=(".mp3", ".m4a")):
+    """
+    Recursively find all audio files in the given directory with the specified extensions.
+    Returns a list of file paths.
+    """
+    audio_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.lower().endswith(extensions):
+                audio_files.append(os.path.join(root, file))
+    return audio_files
+
+def ensure_directory_exists(directory):
+    """
+    Create the directory if it does not exist.
+    """
+    os.makedirs(directory, exist_ok=True)