Initial commit

2025-05-21 21:03:52 +02:00
commit c47d3205a0
8 changed files with 328 additions and 0 deletions
@@ -0,0 +1,64 @@
 # mknotes
 A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
 ## Features
 - Batch transcribes all `.mp3` and `.m4a` files in a specified directory
 - Saves transcriptions as `.txt` files
 - Enhances notes using GPT-4.1 with a custom prompt
 - Outputs enhanced notes in markdown format
 - Configurable input and output directories
 ## Installation
 ```bash
 # Clone the repository
 git clone https://github.com/yourusername/mknotes.git
 cd mknotes
 # Create a virtual environment
 python -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 # Install dependencies
 pip install -r requirements.txt
 ```
 ## Usage
 ```bash
 export OPENAI_API_KEY="your-api-key-here"
 python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--model-size medium] [--turbo] [--force]
 ```
 - `--input-dir`: Directory containing audio files (required)
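 - `--output-dir`: Directory for output files (default: "output")
 - `--model-size`: Faster Whisper model size, one of `tiny`, `base`, `small`, `medium` or `large` (default: "medium")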
 - `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
 - `--force`: Force re-processing of files even if output files already exist
 ### Turbo Mode Hardware Requirements
 The `--turbo` flag enables faster inference using the `int8_float16` compute type, which can significantly speed up transcription. However, this requires:
 - CUDA-compatible GPU with Tensor Cores (NVIDIA Turing, Ampere, or newer architecture)
 - Or CPU with AVX2 support
 If your hardware does not support this optimization, the program will automatically fall back to the next most compatible compute type and print a warning.
 #### Compute Type Fallback
 The program will attempt to use the most efficient compute type supported by your hardware, in the following order:
 - `int8_float16` (if `--turbo` is enabled)
 - `float16`
 - `int8`
 - `float32` (most compatible, works on virtually all hardware)
 If a compute type is not supported, the program will try the next one in the list until successful.
 ## Requirements
 - Python 3.8+
 - [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
 - [OpenAI Python SDK](https://github.com/openai/openai-python)
@@ -0,0 +1,62 @@
 # Entry point for the mknotes audio transcription and note enhancement tool
 import os
 from tqdm import tqdm
 from src.cli import parse_args
 from src.utils import find_audio_files, ensure_directory_exists
 from src.transcriber import transcribe_audio
 from src.enhancer import enhance_note
 def main():
    """
    Run the batch pipeline: transcribe every audio file found, then enhance each transcription.

    For each audio file under the input directory a ``<name>.txt`` transcription and a
    ``<name>.md`` enhanced note are written to the output directory. Files whose ``.md``
    already exists are skipped, and an existing ``.txt`` is reused, unless ``--force`` is set.
    A failure while transcribing or enhancing one file is reported and the batch continues
    with the next file.
    """
    args = parse_args()
    input_dir = args.input_dir
    output_dir = args.output_dir
    model_size = args.model_size
    ensure_directory_exists(output_dir)
    audio_files = find_audio_files(input_dir)
    if not audio_files:
        print(f"No audio files found in {input_dir}.")
        return
    print(f"Found {len(audio_files)} audio files. Starting transcription and enhancement...")
    for audio_path in tqdm(audio_files, desc="Processing files"):
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        txt_path = os.path.join(output_dir, base_name + ".txt")
        md_path = os.path.join(output_dir, base_name + ".md")
        # Skip if enhanced note already exists (unless force flag is set).
        # Messages inside the loop go through tqdm.write so they do not break the progress bar.
        if os.path.exists(md_path) and not args.force:
            tqdm.write(f"Skipping {audio_path} - enhanced note already exists at {md_path}")
            continue
        # Obtain the transcription; one unreadable or corrupt file must not abort the batch.
        try:
            if os.path.exists(txt_path) and not args.force:
                tqdm.write(f"Using existing transcription for {audio_path}")
                with open(txt_path, "r", encoding="utf-8") as f:
                    transcription = f.read()
            else:
                transcription = transcribe_audio(
                    audio_path,
                    txt_path,
                    model_size=model_size,
                    turbo=args.turbo
                )
        except Exception as exc:
            tqdm.write(f"Error transcribing {audio_path}: {exc}")
            continue
        # Reaching this point means the note is missing or --force was given, so always enhance.
        try:
            enhanced_note = enhance_note(transcription)
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(enhanced_note)
        except Exception as exc:
            tqdm.write(f"Error enhancing note for {audio_path}: {exc}")
    print(f"Processing complete. Output saved to: {output_dir}")
 # Run the pipeline only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
 faster-whisper
 openai
 argparse
 tqdm
@@ -0,0 +1 @@
 # mknotes.src package
@@ -0,0 +1,38 @@
 # Command-line interface for mknotes
 import argparse
 def parse_args(argv=None):
    """
    Parse the mknotes command-line options.

    Args:
        argv: Optional list of argument strings to parse instead of the process
            command line. When None (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``input_dir``, ``output_dir``, ``model_size``,
        ``turbo`` and ``force`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe audio files and enhance notes using GPT-4.1"
    )
    parser.add_argument(
        "--input-dir",
        type=str,
        required=True,
        help="Directory containing audio files (.mp3, .m4a)"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="output",
        help="Directory to save output files (default: output)"
    )
    parser.add_argument(
        "--model-size",
        type=str,
        default="medium",
        choices=["tiny", "base", "small", "medium", "large"],
        help="Faster Whisper model size (default: medium)"
    )
    parser.add_argument(
        "--turbo",
        action="store_true",
        help="Enable turbo mode for faster inference (uses int8_float16 compute type)"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force re-processing of files even if output files already exist"
    )
    # Passing argv through lets callers and tests supply arguments without touching sys.argv.
    return parser.parse_args(argv)
@@ -0,0 +1,94 @@
 # Note enhancement logic using OpenAI GPT-4.1
 import openai
 import os
 # System message for the chat completion issued by enhance_note: sets the assistant
 # persona and the house style rules (British English, date/time formats, units).
 SYSTEM_PROMPT = """You are Edison, an expert executive assistant to the CTO of an IT technology firm with over 22 years of experience in technology. Your task is to provide a deep-dive consultation tailored to the client's issue. Ensure your responses make the user feel understood, guided, and satisfied. The name of the CTO is Heiko.
 The consultation is deemed successful when the user explicitly communicates their satisfaction with the solution.
 **Instructions:**
 - Write clearly and straight to the point.
 - Use professional business English.
 - Use always British English, not American English.
 - Format titles, main sections and subsections:  
  - Capitalise only the first word of each title, section, and subsection.  
  - Keep all subsequent words in lowercase except for acronyms, abbreviations and proper nouns, which should remain in their proper uppercase form.  
 - Do not use emojis.
 - Format dates appropriately based on context:  
  - Use the ISO format (`yyyy-MM-dd`) for technical content, such as code, specifications, tables, deadlines, or numbered/bulleted lists (e.g., 2024-02-12).  
  - Use the British standard date format `<day> <month> <year>` in general, conversational, or non-technical text (e.g., 12 February 2024). Use the current year (2025) if no year is provided.  
 - Use the 24-hour time format (HH:mm) consistently throughout.
 - Introduce abbreviations with the full term followed by the abbreviation in parentheses on their first mention, only when the context is provided. 
 	- Do not introduce abbreviations for AI, CPU, and HPC.
 - Use only the metric system and automatically convert imperial measurements (like Fahrenheit, inches, or feet) to metric units.
 - Ensure that all phone numbers are formatted in the international format starting with a '+' followed by the country code, area code, and local number (e.g., +49-111-22223333).
 **Guidelines for British English:**
 British English is the form of English used in the United Kingdom, characterised by distinct spelling, vocabulary, grammar, and punctuation.
 1. **Spelling Differences:**  
   - Use "-our" instead of "-or" (e.g., "colour" not "color", "honour" not "honor").  
   - Use "-re" instead of "-er" (e.g., "centre" not "center", "metre" not "meter").  
   - Prefer "-ise" over "-ize" (e.g., "realise" instead of "realize").  
 2. **Grammar Differences:**  
   - Use the present perfect tense with "just," "yet," and "already" (e.g., "I have just eaten").  
   - Treat collective nouns as singular or plural depending on context (e.g., "The team is winning" or "The team are playing well").  
 3. **Punctuation Usage:**  
   - Use single quotation marks for initial quotes and double quotation marks for quotes within quotes (e.g., 'He said, "Hello."').  
   - Place commas and periods outside quotation marks when they are not part of the quoted material (e.g., 'He said "hello", and then left.').
 """
 # Task instruction sent as the first user message by enhance_note; the transcription
 # text follows in a second user message.
 PROMPT = """Enhance existing notes using additional context provided from audio transcription or uploaded file content. Your task is to make the notes more useful and comprehensive by incorporating relevant information from the provided context.
 Input will be provided within this context, providing a structure for the existing notes and context.
 # Output Format
 Provide the enhanced notes in markdown format. Use markdown syntax for headings, lists, and emphasis to improve clarity and presentation. Ensure that all integrated content from the context is accurately reflected. Return only the markdown-formatted note. Do not wrap the output in code blocks or use triple backticks."""
 def get_model_max_tokens(client, model_name):
    """
    Look up a token limit for ``model_name`` through ``client.models.retrieve``.

    The ``context_window`` attribute of the returned model object is preferred, then
    ``max_tokens``. When neither carries a value, 4096 is returned for names containing
    "gpt-4" and 2048 otherwise. Any exception raised along the way yields 4096.
    """
    try:
        info = client.models.retrieve(model_name)
        limit = getattr(info, "context_window", None)
        if limit is None:
            # Second choice: a max_tokens attribute, if the model object has one.
            limit = getattr(info, "max_tokens", None)
        if limit is not None:
            return limit
        # Nothing reported by the API object: use the hard-coded per-family default.
        return 4096 if "gpt-4" in model_name else 2048
    except Exception:
        return 4096
 def enhance_note(transcription_text, api_key=None):
    """
    Enhance the transcription using OpenAI GPT-4.1 and return markdown-formatted notes.

    Args:
        transcription_text: The raw transcription to turn into notes.
        api_key: OpenAI API key. When None, the OPENAI_API_KEY environment variable is used.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        ValueError: If the transcription is empty or whitespace-only, or if no API key
            is supplied or set in the environment.
        RuntimeError: If the API response contains no choices or no text content.
    """
    # Fail fast on empty input rather than asking the model to expand nothing.
    if not transcription_text or not transcription_text.strip():
        raise ValueError("Transcription is empty; nothing to enhance.")
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not provided or set in environment variable OPENAI_API_KEY.")
    client = openai.OpenAI(api_key=api_key)
    model_name = "gpt-4.1"
    # 90% of whatever limit get_model_max_tokens reports is used as the completion cap.
    # NOTE(review): max_tokens limits the generated output, not the prompt; if the lookup
    # ever returns a full context window this value may exceed the model's output limit.
    max_tokens = int(get_model_max_tokens(client, model_name) * 0.9)
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": PROMPT},
            {"role": "user", "content": transcription_text}
        ],
        max_tokens=max_tokens,
        temperature=0.3
    )
    if not response.choices:
        raise RuntimeError("OpenAI response contained no choices.")
    content = response.choices[0].message.content
    # content can be None (for example on a refusal); report that clearly instead of
    # failing with an AttributeError on .strip().
    if content is None:
        raise RuntimeError("OpenAI response contained no text content.")
    return content.strip()
@@ -0,0 +1,44 @@
 # Audio transcription logic using Faster Whisper
 from faster_whisper import WhisperModel
 # Initialised models keyed by (model_size, turbo), so a batch run loads each model only once
 # instead of re-initialising it for every audio file.
 _MODEL_CACHE = {}
 def _load_model(model_size, turbo):
    """Return a WhisperModel for the given settings, initialising and caching it on first use."""
    cache_key = (model_size, turbo)
    if cache_key in _MODEL_CACHE:
        return _MODEL_CACHE[cache_key]
    # Try compute types in order of preference
    compute_types = ["float16", "int8", "float32"]
    if turbo:
        compute_types.insert(0, "int8_float16")
    model = None
    for compute_type in compute_types:
        try:
            print(f"Attempting to initialize model with compute_type: {compute_type}")
            model = WhisperModel(model_size, compute_type=compute_type)
            print(f"Successfully initialized model with compute_type: {compute_type}")
            break
        except ValueError as e:
            # Only an unsupported compute type is recoverable, and only while
            # another candidate remains; anything else is re-raised unchanged.
            if "compute type" in str(e) and compute_type != compute_types[-1]:
                print(f"Warning: {compute_type} compute type not supported by your hardware.")
                print("Trying next compute type...")
            else:
                raise
    if model is None:
        raise RuntimeError("Failed to initialize Whisper model with any compute type")
    _MODEL_CACHE[cache_key] = model
    return model
 def transcribe_audio(input_path, output_path, model_size="medium", turbo=False):
    """
    Transcribe an audio file using Faster Whisper and save the result as a .txt file.

    Args:
        input_path: Path to the audio file
        output_path: Path to save the transcription
        model_size: Size of the Whisper model to use
        turbo: Whether to use int8_float16 compute type for faster inference

    Returns:
        The transcription text, one segment per line, with surrounding whitespace stripped.

    Raises:
        ValueError: Re-raised from model initialisation when it is not a recoverable
            compute-type error, or when the last compute type also fails.
        RuntimeError: If no model could be initialised.
    """
    model = _load_model(model_size, turbo)
    segments, _info = model.transcribe(input_path)
    # One line per segment; join avoids repeated string concatenation in a loop.
    transcription = "\n".join(segment.text.strip() for segment in segments).strip()
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(transcription)
    return transcription
@@ -0,0 +1,21 @@
 # Helper functions for mknotes
 import os
 def find_audio_files(directory, extensions=(".mp3", ".m4a")):
    """
    Recursively find all audio files in the given directory with the specified extensions.

    Args:
        directory: Root directory to search, including all of its subdirectories.
        extensions: A single extension string or any iterable of extension strings.
            Matching is case-insensitive.

    Returns:
        A sorted list of file paths, so the processing order is the same on every run.
        The list is empty when nothing matches or the directory does not exist.
    """
    # str.endswith needs a str or a tuple; normalise so lists and sets work as well.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # File names are lower-cased before comparison, so the suffixes must be too.
    suffixes = tuple(ext.lower() for ext in extensions)
    audio_files = []
    for root, _, files in os.walk(directory):
        audio_files.extend(
            os.path.join(root, file) for file in files if file.lower().endswith(suffixes)
        )
    # os.walk yields entries in arbitrary order; sort for a deterministic result.
    audio_files.sort()
    return audio_files
 def ensure_directory_exists(directory):
    """
    Make sure ``directory`` is present, creating it and any missing parents.

    An already existing directory is left untouched.
    """
    os.makedirs(name=directory, exist_ok=True)