Initial commit

2025-05-21 21:03:52 +02:00
commit c47d3205a0
8 changed files with 328 additions and 0 deletions
@@ -0,0 +1 @@
+# mknotes.src package
@@ -0,0 +1,38 @@
+# Command-line interface for mknotes
+
+import argparse
+
+def parse_args(argv=None):
+    """
+    Parse the mknotes command-line options.
+
+    Args:
+        argv: Argument strings to parse; None (default) reads sys.argv[1:].
+    Returns:
+        argparse.Namespace: input_dir, output_dir, model_size, turbo, force.
+    """
+    parser = argparse.ArgumentParser(
+        description="Transcribe audio files and enhance notes using GPT-4.1"
+    )
+    parser.add_argument(
+        "--input-dir", type=str, required=True,
+        help="Directory containing audio files (.mp3, .m4a)",
+    )
+    parser.add_argument(
+        "--output-dir", type=str, default="output",
+        help="Directory to save output files (default: output)",
+    )
+    parser.add_argument(
+        "--model-size", type=str, default="medium",
+        choices=["tiny", "base", "small", "medium", "large"],
+        help="Faster Whisper model size (default: medium)",
+    )
+    parser.add_argument(
+        "--turbo", action="store_true",
+        help="Enable turbo mode for faster inference (uses int8_float16 compute type)",
+    )
+    parser.add_argument(
+        "--force", action="store_true",
+        help="Force re-processing of files even if output files already exist",
+    )
+    return parser.parse_args(argv)
@@ -0,0 +1,94 @@
+# Note enhancement logic using OpenAI GPT-4.1
+
+import openai
+import os
+# System message for enhance_note: the "Edison" persona plus house-style rules (British English, dates, units).
+SYSTEM_PROMPT = """You are Edison, an expert executive assistant to the CTO of an IT technology firm with over 22 years of experience in technology. Your task is to provide a deep-dive consultation tailored to the client's issue. Ensure your responses make the user feel understood, guided, and satisfied. The name of the CTO is Heiko.
+
+The consultation is deemed successful when the user explicitly communicates their satisfaction with the solution.
+
+**Instructions:**
+- Write clearly and straight to the point.
+- Use professional business English.
+- Use always British English, not American English.
+- Format titles, main sections and subsections:  
+  - Capitalise only the first word of each title, section, and subsection.  
+  - Keep all subsequent words in lowercase except for acronyms, abbreviations and proper nouns, which should remain in their proper uppercase form.  
+- Do not use emojis.
+- Format dates appropriately based on context:  
+  - Use the ISO format (`yyyy-MM-dd`) for technical content, such as code, specifications, tables, deadlines, or numbered/bulleted lists (e.g., 2024-02-12).  
+  - Use the British standard date format `<day> <month> <year>` in general, conversational, or non-technical text (e.g., 12 February 2024). Use the current year (2025) if no year is provided.  
+- Use the 24-hour time format (HH:mm) consistently throughout.
+- Introduce abbreviations with the full term followed by the abbreviation in parentheses on their first mention, only when the context is provided. 
+	- Do not introduce abbreviations for AI, CPU, and HPC.
+- Use only the metric system and automatically convert imperial measurements (like Fahrenheit, inches, or feet) to metric units.
+- Ensure that all phone numbers are formatted in the international format starting with a '+' followed by the country code, area code, and local number (e.g., +49-111-22223333).
+
+**Guidelines for British English:**
+British English is the form of English used in the United Kingdom, characterised by distinct spelling, vocabulary, grammar, and punctuation.
+1. **Spelling Differences:**  
+   - Use "-our" instead of "-or" (e.g., "colour" not "color", "honour" not "honor").  
+   - Use "-re" instead of "-er" (e.g., "centre" not "center", "metre" not "meter").  
+   - Prefer "-ise" over "-ize" (e.g., "realise" instead of "realize").  
+2. **Grammar Differences:**  
+   - Use the present perfect tense with "just," "yet," and "already" (e.g., "I have just eaten").  
+   - Treat collective nouns as singular or plural depending on context (e.g., "The team is winning" or "The team are playing well").  
+3. **Punctuation Usage:**  
+   - Use single quotation marks for initial quotes and double quotation marks for quotes within quotes (e.g., 'He said, "Hello."').  
+   - Place commas and periods outside quotation marks when they are not part of the quoted material (e.g., 'He said "hello", and then left.').
+"""
+# First user message for enhance_note: the task description and the markdown-only output contract.
+PROMPT = """Enhance existing notes using additional context provided from audio transcription or uploaded file content. Your task is to make the notes more useful and comprehensive by incorporating relevant information from the provided context.
+
+Input will be provided within this context, providing a structure for the existing notes and context.
+
+# Output Format
+
+Provide the enhanced notes in markdown format. Use markdown syntax for headings, lists, and emphasis to improve clarity and presentation. Ensure that all integrated content from the context is accurately reflected. Return only the markdown-formatted note. Do not wrap the output in code blocks or use triple backticks."""
+
+def get_model_max_tokens(client, model_name):
+    """
+    Look up the token limit that the OpenAI API reports for *model_name*.
+
+    Reads ``context_window`` (then ``max_tokens``) from the result of
+    ``client.models.retrieve``. When neither is set the limit defaults to
+    4096 for names containing "gpt-4" and 2048 otherwise; any exception
+    raised during the lookup yields 4096.
+    """
+    try:
+        info = client.models.retrieve(model_name)
+        limit = getattr(info, "context_window", None)
+        if limit is None:
+            # Secondary attribute; a missing attribute counts as "not reported".
+            limit = getattr(info, "max_tokens", None)
+        if limit is None:
+            limit = 4096 if "gpt-4" in model_name else 2048
+        return limit
+    except Exception:
+        return 4096
+
+def enhance_note(transcription_text, api_key=None):
+    """
+    Turn a transcription into markdown notes via the OpenAI chat completions API.
+
+    Args:
+        transcription_text: Transcript sent as the final user message.
+        api_key: OpenAI key; defaults to the OPENAI_API_KEY environment variable.
+    Returns: the reply text with surrounding whitespace stripped.
+    Raises: ValueError without an API key; RuntimeError if the reply has no content.
+    """
+    api_key = api_key if api_key is not None else os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OpenAI API key not provided or set in environment variable OPENAI_API_KEY.")
+    client = openai.OpenAI(api_key=api_key)
+    model_name = "gpt-4.1"
+    max_tokens = int(get_model_max_tokens(client, model_name) * 0.9)  # 90% of the reported limit, as headroom
+    messages = [{"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": PROMPT},
+                {"role": "user", "content": transcription_text}]
+    response = client.chat.completions.create(
+        model=model_name, messages=messages, max_tokens=max_tokens, temperature=0.3)
+    content = response.choices[0].message.content
+    if content is None:  # the SDK types message.content as optional
+        raise RuntimeError("OpenAI response contained no message content")
+    return content.strip()
@@ -0,0 +1,44 @@
+# Audio transcription logic using Faster Whisper
+
+from faster_whisper import WhisperModel
+
+def transcribe_audio(input_path, output_path, model_size="medium", turbo=False):
+    """
+    Transcribe an audio file with Faster Whisper and write the text to a file.
+
+    Args:
+        input_path: Path to the audio file
+        output_path: Path of the UTF-8 text file to write
+        model_size: Size of the Whisper model to use
+        turbo: Whether to try the int8_float16 compute type first
+    Returns:
+        The transcription: one line per segment, outer whitespace stripped.
+    Raises:
+        ValueError: Re-raised when model creation fails and no fallback applies.
+        RuntimeError: If no model was initialised (defensive check).
+    """
+    # Compute types in order of preference; later entries are fallbacks.
+    compute_types = ["float16", "int8", "float32"]
+    if turbo:
+        compute_types.insert(0, "int8_float16")
+    model = None
+    for compute_type in compute_types:
+        print(f"Attempting to initialize model with compute_type: {compute_type}")
+        try:
+            model = WhisperModel(model_size, compute_type=compute_type)
+        except ValueError as e:
+            if "compute type" not in str(e) or compute_type == compute_types[-1]:
+                raise
+            print(f"Warning: {compute_type} compute type not supported by your hardware.")
+            print("Trying next compute type...")
+            continue
+        print(f"Successfully initialized model with compute_type: {compute_type}")
+        break
+    if model is None:
+        raise RuntimeError("Failed to initialize Whisper model with any compute type")
+
+    segments, _ = model.transcribe(input_path)
+    transcription = "\n".join(segment.text.strip() for segment in segments).strip()
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(transcription)
+    return transcription
@@ -0,0 +1,21 @@
+# Helper functions for mknotes
+
+import os
+
+def find_audio_files(directory, extensions=(".mp3", ".m4a")):
+    """
+    Recursively collect the files under *directory* whose names end with one of
+    *extensions* (a string or any iterable of strings), ignoring letter case.
+    Returns a list of file paths in os.walk order.
+    """
+    if isinstance(extensions, str):
+        extensions = (extensions,)
+    suffixes = tuple(ext.lower() for ext in extensions)  # endswith() needs a tuple
+    return [os.path.join(root, name) for root, _, names in os.walk(directory)
+            for name in names if name.lower().endswith(suffixes)]
+
+def ensure_directory_exists(directory):
+    """
+    Make sure *directory* exists; an already existing directory is not an error.
+    """
+    os.makedirs(name=directory, exist_ok=True)