Initial commit

This commit is contained in:
2025-05-21 21:03:52 +02:00
commit c47d3205a0
8 changed files with 328 additions and 0 deletions
+64
View File
@@ -0,0 +1,64 @@
# mknotes
A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
## Features
- Batch transcribes all `.mp3` and `.m4a` files in a specified directory, including its subdirectories
- Saves transcriptions as `.txt` files
- Enhances notes using GPT-4.1 with a custom prompt
- Outputs enhanced notes in markdown format
- Configurable input and output directories
## Installation
```bash
# Clone the repository
git clone https://github.com/yourusername/mknotes.git
cd mknotes
# Create a virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
```
## Usage
```bash
export OPENAI_API_KEY="your-api-key-here"
python main.py --input-dir /path/to/audio/files [--output-dir /path/to/output] [--model-size SIZE] [--turbo] [--force]
```
- `--input-dir`: Directory containing audio files (required)
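- `--output-dir`: Directory for output files (default: "output")
- `--model-size`: Faster Whisper model size, one of `tiny`, `base`, `small`, `medium` or `large` (default: "medium")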
- `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
- `--force`: Force re-processing of files even if output files already exist
### Turbo Mode Hardware Requirements
The `--turbo` flag enables faster inference using the `int8_float16` compute type, which can significantly speed up transcription. However, this requires:
- CUDA-compatible GPU with Tensor Cores (NVIDIA Ampere, Turing, or newer architecture)
- Or CPU with AVX2 support
If your hardware does not support this optimization, the program will automatically fall back to the next most compatible compute type and print a warning.
#### Compute Type Fallback
The program will attempt to use the most efficient compute type supported by your hardware, in the following order:
- `int8_float16` (if `--turbo` is enabled)
- `float16`
- `int8`
- `float32` (most compatible, works on virtually all hardware)
If a compute type is not supported, the program will try the next one in the list until successful.
## Requirements
- Python 3.8+
- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
- [OpenAI Python SDK](https://github.com/openai/openai-python)
- [tqdm](https://github.com/tqdm/tqdm)
+62
View File
@@ -0,0 +1,62 @@
# Entry point for the mknotes audio transcription and note enhancement tool
import os
from tqdm import tqdm
from src.cli import parse_args
from src.utils import find_audio_files, ensure_directory_exists
from src.transcriber import transcribe_audio
from src.enhancer import enhance_note
def main():
    """Run the mknotes pipeline over every audio file in the input directory.

    For each audio file the transcription is written to ``<name>.txt`` and the
    enhanced note to ``<name>.md`` inside the output directory. Existing
    outputs are reused or skipped unless ``--force`` is given. A failure while
    transcribing or enhancing one file is reported and the batch continues
    with the next file.
    """
    args = parse_args()
    input_dir = args.input_dir
    output_dir = args.output_dir
    model_size = args.model_size

    ensure_directory_exists(output_dir)

    audio_files = find_audio_files(input_dir)
    if not audio_files:
        print(f"No audio files found in {input_dir}.")
        return

    print(f"Found {len(audio_files)} audio files. Starting transcription and enhancement...")

    for audio_path in tqdm(audio_files, desc="Processing files"):
        # NOTE(review): outputs are named after the file's base name only, while
        # the search is recursive, so "a/x.mp3", "b/x.mp3" and "a/x.m4a" all map
        # to the same "x.txt"/"x.md". Confirm whether that is acceptable.
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        txt_path = os.path.join(output_dir, base_name + ".txt")
        md_path = os.path.join(output_dir, base_name + ".md")

        # Skip if enhanced note already exists (unless force flag is set)
        if os.path.exists(md_path) and not args.force:
            print(f"Skipping {audio_path} - enhanced note already exists at {md_path}")
            continue

        # Reuse an existing transcription when possible; transcribing is the
        # expensive step.
        if os.path.exists(txt_path) and not args.force:
            print(f"Using existing transcription for {audio_path}")
            with open(txt_path, "r", encoding="utf-8") as f:
                transcription = f.read()
        else:
            try:
                transcription = transcribe_audio(
                    audio_path,
                    txt_path,
                    model_size=model_size,
                    turbo=args.turbo,
                )
            except Exception as e:
                # Per-file boundary: one unreadable or corrupt audio file must
                # not abort the remaining files in the batch.
                print(f"Error transcribing {audio_path}: {e}")
                continue

        # At this point the note either does not exist or --force was given,
        # so enhancement always runs.
        try:
            enhanced_note = enhance_note(transcription)
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(enhanced_note)
        except Exception as e:
            print(f"Error enhancing note for {audio_path}: {e}")

    print(f"Processing complete. Output saved to: {output_dir}")


if __name__ == "__main__":
    main()
+4
View File
@@ -0,0 +1,4 @@
faster-whisper
openai
argparse
tqdm
+1
View File
@@ -0,0 +1 @@
# mknotes.src package
+38
View File
@@ -0,0 +1,38 @@
# Command-line interface for mknotes
import argparse
def parse_args(argv=None):
    """Parse the mknotes command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) the arguments are taken from ``sys.argv``, which is the
            behaviour existing callers rely on. Passing a list makes the
            parser usable from other code.

    Returns:
        An ``argparse.Namespace`` with ``input_dir``, ``output_dir``,
        ``model_size``, ``turbo`` and ``force`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe audio files and enhance notes using GPT-4.1"
    )
    parser.add_argument(
        "--input-dir",
        type=str,
        required=True,
        help="Directory containing audio files (.mp3, .m4a)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="output",
        help="Directory to save output files (default: output)",
    )
    parser.add_argument(
        "--model-size",
        type=str,
        default="medium",
        choices=["tiny", "base", "small", "medium", "large"],
        help="Faster Whisper model size (default: medium)",
    )
    parser.add_argument(
        "--turbo",
        action="store_true",
        help="Enable turbo mode for faster inference (uses int8_float16 compute type)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force re-processing of files even if output files already exist",
    )
    return parser.parse_args(argv)
+94
View File
@@ -0,0 +1,94 @@
# Note enhancement logic using OpenAI GPT-4.1
import openai
import os
# System message sent with every enhancement request (see enhance_note). It sets
# the assistant persona and the house style for the generated notes: British
# English, sentence-case headings, date/time formatting, metric units and
# international phone number format.
# NOTE(review): the prompt hard-codes the current year as 2025, which will go
# stale; confirm whether the year should be injected at runtime instead.
SYSTEM_PROMPT = """You are Edison, an expert executive assistant to the CTO of an IT technology firm with over 22 years of experience in technology. Your task is to provide a deep-dive consultation tailored to the client's issue. Ensure your responses make the user feel understood, guided, and satisfied. The name of the CTO is Heiko.
The consultation is deemed successful when the user explicitly communicates their satisfaction with the solution.
**Instructions:**
- Write clearly and straight to the point.
- Use professional business English.
- Use always British English, not American English.
- Format titles, main sections and subsections:
- Capitalise only the first word of each title, section, and subsection.
- Keep all subsequent words in lowercase except for acronyms, abbreviations and proper nouns, which should remain in their proper uppercase form.
- Do not use emojis.
- Format dates appropriately based on context:
- Use the ISO format (`yyyy-MM-dd`) for technical content, such as code, specifications, tables, deadlines, or numbered/bulleted lists (e.g., 2024-02-12).
- Use the British standard date format `<day> <month> <year>` in general, conversational, or non-technical text (e.g., 12 February 2024). Use the current year (2025) if no year is provided.
- Use the 24-hour time format (HH:mm) consistently throughout.
- Introduce abbreviations with the full term followed by the abbreviation in parentheses on their first mention, only when the context is provided.
- Do not introduce abbreviations for AI, CPU, and HPC.
- Use only the metric system and automatically convert imperial measurements (like Fahrenheit, inches, or feet) to metric units.
- Ensure that all phone numbers are formatted in the international format starting with a '+' followed by the country code, area code, and local number (e.g., +49-111-22223333).
**Guidelines for British English:**
British English is the form of English used in the United Kingdom, characterised by distinct spelling, vocabulary, grammar, and punctuation.
1. **Spelling Differences:**
- Use "-our" instead of "-or" (e.g., "colour" not "color", "honour" not "honor").
- Use "-re" instead of "-er" (e.g., "centre" not "center", "metre" not "meter").
- Prefer "-ise" over "-ize" (e.g., "realise" instead of "realize").
2. **Grammar Differences:**
- Use the present perfect tense with "just," "yet," and "already" (e.g., "I have just eaten").
- Treat collective nouns as singular or plural depending on context (e.g., "The team is winning" or "The team are playing well").
3. **Punctuation Usage:**
- Use single quotation marks for initial quotes and double quotation marks for quotes within quotes (e.g., 'He said, "Hello."').
- Place commas and periods outside quotation marks when they are not part of the quoted material (e.g., 'He said "hello", and then left.').
"""
# First user message of every enhancement request (see enhance_note); the
# transcription text follows it as a separate user message. It asks for the
# enhanced note as plain markdown without surrounding code fences.
PROMPT = """Enhance existing notes using additional context provided from audio transcription or uploaded file content. Your task is to make the notes more useful and comprehensive by incorporating relevant information from the provided context.
Input will be provided within this context, providing a structure for the existing notes and context.
# Output Format
Provide the enhanced notes in markdown format. Use markdown syntax for headings, lists, and emphasis to improve clarity and presentation. Ensure that all integrated content from the context is accurately reflected. Return only the markdown-formatted note. Do not wrap the output in code blocks or use triple backticks."""
def get_model_max_tokens(client, model_name):
    """Return a token limit for ``model_name``, asking the API first.

    The model metadata is retrieved through ``client.models.retrieve`` and its
    ``context_window`` attribute is used, or ``max_tokens`` when the former is
    missing. Only positive integers are accepted, because the caller does
    arithmetic on the result.

    When the lookup fails or reports no usable value, a conservative fallback
    is returned: 4096 for model names containing ``"gpt-4"``, otherwise 2048.
    The same fallback applies to both cases, so a network error never yields
    a larger limit than a successful but uninformative response.

    Args:
        client: An OpenAI client exposing ``models.retrieve(name)``.
        model_name: Identifier of the model to look up.

    Returns:
        The token limit as a positive ``int``.
    """
    fallback = 4096 if "gpt-4" in model_name else 2048
    try:
        model_info = client.models.retrieve(model_name)
        # NOTE(review): it is unclear whether the models endpoint reports either
        # attribute; the fallback is likely the common path. Confirm.
        for attribute in ("context_window", "max_tokens"):
            value = getattr(model_info, attribute, None)
            # bool is a subclass of int and must not be taken as a limit.
            if isinstance(value, int) and not isinstance(value, bool) and value > 0:
                return value
    except Exception:
        # Best-effort lookup: any API or transport error means "unknown".
        return fallback
    return fallback
def enhance_note(transcription_text, api_key=None):
    """Enhance a transcription with OpenAI GPT-4.1 and return markdown notes.

    The request consists of the system prompt, the enhancement instructions
    and the transcription itself. The completion budget is 90% of the limit
    reported by ``get_model_max_tokens``.

    Args:
        transcription_text: The raw transcription to turn into notes.
        api_key: OpenAI API key. When ``None`` the ``OPENAI_API_KEY``
            environment variable is used.

    Returns:
        The enhanced note as a markdown string with surrounding whitespace
        removed.

    Raises:
        ValueError: If no API key is given and none is set in the environment.
        RuntimeError: If the API response carries no text content.
    """
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not provided or set in environment variable OPENAI_API_KEY.")

    client = openai.OpenAI(api_key=api_key)
    model_name = "gpt-4.1"
    # Use 90% of max to allow for prompt/context
    # NOTE(review): `max_tokens` appears to cap only the generated completion;
    # confirm the 90% margin is still meaningful for this model.
    max_tokens = int(get_model_max_tokens(client, model_name) * 0.9)

    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": PROMPT},
            {"role": "user", "content": transcription_text},
        ],
        max_tokens=max_tokens,
        temperature=0.3,
    )

    # A response may carry no choices or a message without text (for example
    # a refusal). Report that clearly instead of failing on `.strip()`.
    if not response.choices:
        raise RuntimeError("OpenAI returned no choices for the enhanced note.")
    content = response.choices[0].message.content
    if content is None:
        raise RuntimeError("OpenAI returned no text content for the enhanced note.")
    return content.strip()
+44
View File
@@ -0,0 +1,44 @@
# Audio transcription logic using Faster Whisper
from faster_whisper import WhisperModel
# Initialised models keyed by (model_size, turbo). Loading a Whisper model is
# expensive, and transcribe_audio is called once per audio file, so each
# configuration is initialised only once per process.
_MODEL_CACHE = {}


def _load_model(model_size, turbo):
    """Return a WhisperModel for the given settings, initialising it on first use."""
    cache_key = (model_size, bool(turbo))
    cached = _MODEL_CACHE.get(cache_key)
    if cached is not None:
        return cached

    # Try compute types in order of preference
    if turbo:
        compute_types = ["int8_float16", "float16", "int8", "float32"]
    else:
        compute_types = ["float16", "int8", "float32"]

    for compute_type in compute_types:
        print(f"Attempting to initialize model with compute_type: {compute_type}")
        try:
            model = WhisperModel(model_size, compute_type=compute_type)
        except ValueError as e:
            # Only an unsupported compute type justifies trying the next one;
            # any other error, or a failure of the last candidate, is fatal.
            if "compute type" in str(e) and compute_type != compute_types[-1]:
                print(f"Warning: {compute_type} compute type not supported by your hardware.")
                print("Trying next compute type...")
                continue
            raise
        print(f"Successfully initialized model with compute_type: {compute_type}")
        _MODEL_CACHE[cache_key] = model
        return model

    # Defensive: only reachable if the candidate list were empty.
    raise RuntimeError("Failed to initialize Whisper model with any compute type")


def transcribe_audio(input_path, output_path, model_size="medium", turbo=False):
    """
    Transcribe an audio file using Faster Whisper and save the result as a .txt file.

    The model is initialised on the first call for a given ``model_size`` and
    ``turbo`` combination and reused afterwards, so the compute-type messages
    are printed only once per configuration.

    Args:
        input_path: Path to the audio file
        output_path: Path to save the transcription
        model_size: Size of the Whisper model to use
        turbo: Whether to use int8_float16 compute type for faster inference

    Returns:
        The transcription text, one segment per line, without leading or
        trailing whitespace.

    Raises:
        ValueError: If the model cannot be initialised with any compute type
            or for a reason unrelated to the compute type.
    """
    model = _load_model(model_size, turbo)

    segments, _info = model.transcribe(input_path)
    # One line per segment; join avoids repeated string concatenation.
    transcription = "\n".join(segment.text.strip() for segment in segments).strip()

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(transcription)
    return transcription
+21
View File
@@ -0,0 +1,21 @@
# Helper functions for mknotes
import os
def find_audio_files(directory, extensions=(".mp3", ".m4a")):
    """
    Recursively find all audio files in the given directory with the specified extensions.

    Matching is case-insensitive for both the file names and the given
    extensions. The result is sorted so files are processed in a stable order
    regardless of the order in which the file system lists them.

    Args:
        directory: Root directory to search. A missing directory yields an
            empty list.
        extensions: A single extension string or any iterable of extension
            strings (for example ``".mp3"`` or ``[".mp3", ".wav"]``).

    Returns:
        A sorted list of file paths.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    # str.endswith needs a tuple; lower-casing here lets ".MP3" match as well.
    suffixes = tuple(extension.lower() for extension in extensions)

    audio_files = []
    for root, _, files in os.walk(directory):
        audio_files.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(suffixes)
        )
    audio_files.sort()
    return audio_files
def ensure_directory_exists(directory):
    """
    Create the directory, including missing parents, if it does not exist.

    An empty path is treated as the current directory and needs no creation;
    passing it to ``os.makedirs`` would raise ``FileNotFoundError``.

    Args:
        directory: Path of the directory to create.
    """
    if directory == "":
        return
    os.makedirs(directory, exist_ok=True)