From c47089aa0d12db1c77aec15536e1dd316c7cab81 Mon Sep 17 00:00:00 2001 From: Heiko Joerg Schick Date: Thu, 22 May 2025 20:53:39 +0200 Subject: [PATCH] Add WAV file support with MP3 conversion and reuse - Add support for WAV files with automatic conversion to MP3 - Save converted MP3 files in the same directory as WAV files - Reuse existing MP3 files if already converted - Update documentation and requirements --- README.md | 21 ++++++++++++++++++--- enhancement_proposals.md | 7 ++++--- main.py | 17 ++++++++++++++--- requirements.txt | 1 + src/cli.py | 2 +- src/utils.py | 34 +++++++++++++++++++++++++++++++++- 6 files changed, 71 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 326400c..a625704 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,13 @@ # mknotes -A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model. +A command-line tool to transcribe all MP3, M4A, and WAV audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model. ## Features -- Batch transcribes all `.mp3` and `.m4a` files in a specified directory +- Batch transcribes all `.mp3`, `.m4a`, and `.wav` files in a specified directory +- Automatically converts WAV files to MP3 format before processing + - Converted MP3 files are saved in the same directory as the original WAV files + - Reuses existing MP3 files if they've already been converted - Saves transcriptions as `.txt` files - Enhances notes using GPT-4.1 with a custom prompt - Outputs enhanced notes in markdown format @@ -23,6 +26,16 @@ source venv/bin/activate # On Windows: venv\Scripts\activate # Install dependencies pip install -r requirements.txt + +# Install ffmpeg (required for WAV to MP3 conversion) +# On Ubuntu/Debian: +# sudo apt-get install ffmpeg + +# On macOS with Homebrew: +# brew install ffmpeg + +# On Windows: +# Download from https://ffmpeg.org/download.html and add to PATH ``` ## Usage @@ -32,7 +45,7 @@ export OPENAI_API_KEY="your-api-key-here" python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo] ``` -- `--input-dir`: Directory containing audio files (required) +- `--input-dir`: Directory containing audio files (.mp3, .m4a, .wav) (required) - `--output-dir`: Directory for output files (default: "output") - `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type) - `--force`: Force re-processing of files even if output files already exist @@ -62,3 +75,5 @@ If a compute type is not supported, the program will try the next one in the lis - Python 3.8+ - [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) - [OpenAI Python SDK](https://github.com/openai/openai-python) +- [pydub](https://github.com/jiaaro/pydub) (for WAV to MP3 conversion) +- [ffmpeg](https://ffmpeg.org/) (required by pydub for audio conversion) diff --git a/enhancement_proposals.md b/enhancement_proposals.md index 4a10e17..d532e71 100644 --- a/enhancement_proposals.md +++ b/enhancement_proposals.md @@ -4,9 +4,10 @@ This document outlines proposed improvements for the mknotes software, grouped b ## Feature Enhancements -- **Support for More Audio Formats** - - Extend support beyond MP3 and M4A to include formats like WAV, FLAC, OGG, etc. - - Update the `find_audio_files` function in `utils.py` to recognise additional extensions. +- **Support for More Audio Formats** ✅ (Partially implemented - WAV support added) + - ✅ Added support for WAV files with automatic conversion to MP3 before processing + - Extend support to include additional formats like FLAC, OGG, etc. + - ✅ Updated the `find_audio_files` function in `utils.py` to recognize WAV extension - **Customizable Enhancement Prompts** - Allow users to provide their own prompts via CLI arguments or configuration files. diff --git a/main.py b/main.py index c43b4db..9c94993 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ import os from tqdm import tqdm from src.cli import parse_args -from src.utils import find_audio_files, ensure_directory_exists +from src.utils import find_audio_files, ensure_directory_exists, convert_wav_to_mp3 from src.transcriber import transcribe_audio from src.enhancer import enhance_note @@ -27,7 +27,7 @@ def main(): base_name = os.path.splitext(os.path.basename(audio_path))[0] txt_path = os.path.join(output_dir, base_name + ".txt") md_path = os.path.join(output_dir, base_name + ".md") - + # Skip if enhanced note already exists (unless force flag is set) if os.path.exists(md_path) and not args.force: print(f"Skipping {audio_path} - enhanced note already exists at {md_path}") @@ -39,9 +39,20 @@ def main(): with open(txt_path, "r", encoding="utf-8") as f: transcription = f.read() else: + # Convert WAV to MP3 if needed + path_to_transcribe = audio_path + + if audio_path.lower().endswith(".wav"): + print(f"Processing WAV file: {audio_path}") + mp3_path = convert_wav_to_mp3(audio_path) + if mp3_path: + path_to_transcribe = mp3_path + else: + print(f"Warning: Failed to convert {audio_path}. Will attempt to transcribe the WAV file directly.") + # Transcribe audio transcription = transcribe_audio( - audio_path, + path_to_transcribe, txt_path, model_size=model_size, turbo=args.turbo diff --git a/requirements.txt b/requirements.txt index 8852add..90a05f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ faster-whisper openai argparse tqdm +pydub diff --git a/src/cli.py b/src/cli.py index 10e3d19..36b159d 100644 --- a/src/cli.py +++ b/src/cli.py @@ -10,7 +10,7 @@ def parse_args(): "--input-dir", type=str, required=True, - help="Directory containing audio files (.mp3, .m4a)" + help="Directory containing audio files (.mp3, .m4a, .wav)" ) parser.add_argument( "--output-dir", diff --git a/src/utils.py b/src/utils.py index 9802897..ad3a8c4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,8 +1,10 @@ # Helper functions for mknotes import os +import tempfile +from pydub import AudioSegment -def find_audio_files(directory, extensions=(".mp3", ".m4a")): +def find_audio_files(directory, extensions=(".mp3", ".m4a", ".wav")): """ Recursively find all audio files in the given directory with the specified extensions. Returns a list of file paths. @@ -19,3 +21,33 @@ def ensure_directory_exists(directory): Create the directory if it does not exist. """ os.makedirs(directory, exist_ok=True) + +def convert_wav_to_mp3(wav_path): + """ + Convert a WAV file to MP3 format using pydub if an MP3 version doesn't already exist. + Saves the MP3 file in the same directory as the WAV file. + + Args: + wav_path: Path to the WAV file + + Returns: + Path to the MP3 file + """ + # Determine the output MP3 path (same directory, same name, different extension) + base_name = os.path.splitext(wav_path)[0] + mp3_path = base_name + ".mp3" + + # Check if MP3 version already exists + if os.path.exists(mp3_path): + print(f"Using existing MP3 file: {mp3_path}") + return mp3_path + + # Convert WAV to MP3 + try: + audio = AudioSegment.from_wav(wav_path) + audio.export(mp3_path, format="mp3") + print(f"Converted {wav_path} to MP3 format at {mp3_path}") + return mp3_path + except Exception as e: + print(f"Error converting {wav_path} to MP3: {e}") + return None