From c47089aa0d12db1c77aec15536e1dd316c7cab81 Mon Sep 17 00:00:00 2001
From: Heiko Joerg Schick <info@schihei.de>
Date: Thu, 22 May 2025 20:53:39 +0200
Subject: [PATCH] Add WAV file support with MP3 conversion and reuse

- Add support for WAV files with automatic conversion to MP3
- Save converted MP3 files in the same directory as WAV files
- Reuse existing MP3 files if already converted
- Update documentation and requirements
---
 README.md                | 21 ++++++++++++++++++---
 enhancement_proposals.md |  7 ++++---
 main.py                  | 17 ++++++++++++++---
 requirements.txt         |  1 +
 src/cli.py               |  2 +-
 src/utils.py             | 34 +++++++++++++++++++++++++++++++++-
 6 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 326400c..a625704 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
 # mknotes
 
-A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
+A command-line tool to transcribe all MP3, M4A, and WAV audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
 
 ## Features
 
-- Batch transcribes all `.mp3` and `.m4a` files in a specified directory
+- Batch transcribes all `.mp3`, `.m4a`, and `.wav` files in a specified directory
+- Automatically converts WAV files to MP3 format before processing
+  - Converted MP3 files are saved in the same directory as the original WAV files
+  - Reuses existing MP3 files if they've already been converted
 - Saves transcriptions as `.txt` files
 - Enhances notes using GPT-4.1 with a custom prompt
 - Outputs enhanced notes in markdown format
@@ -23,6 +26,16 @@ source venv/bin/activate  # On Windows: venv\Scripts\activate
 
 # Install dependencies
 pip install -r requirements.txt
+
+# Install ffmpeg (required for WAV to MP3 conversion)
+# On Ubuntu/Debian:
+# sudo apt-get install ffmpeg
+
+# On macOS with Homebrew:
+# brew install ffmpeg
+
+# On Windows:
+# Download from https://ffmpeg.org/download.html and add to PATH
 ```
 
 ## Usage
@@ -32,7 +45,7 @@ export OPENAI_API_KEY="your-api-key-here"
 python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo]
 ```
 
-- `--input-dir`: Directory containing audio files (required)
+- `--input-dir`: Directory containing `.mp3`, `.m4a`, or `.wav` audio files (required)
 - `--output-dir`: Directory for output files (default: "output")
 - `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
 - `--force`: Force re-processing of files even if output files already exist
@@ -62,3 +75,5 @@ If a compute type is not supported, the program will try the next one in the lis
 - Python 3.8+
 - [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
 - [OpenAI Python SDK](https://github.com/openai/openai-python)
+- [pydub](https://github.com/jiaaro/pydub) (for WAV to MP3 conversion)
+- [ffmpeg](https://ffmpeg.org/) (required by pydub for audio conversion)
diff --git a/enhancement_proposals.md b/enhancement_proposals.md
index 4a10e17..d532e71 100644
--- a/enhancement_proposals.md
+++ b/enhancement_proposals.md
@@ -4,9 +4,10 @@ This document outlines proposed improvements for the mknotes software, grouped b
 
 ## Feature Enhancements
 
-- **Support for More Audio Formats**
-  - Extend support beyond MP3 and M4A to include formats like WAV, FLAC, OGG, etc.
-  - Update the `find_audio_files` function in `utils.py` to recognise additional extensions.
+- **Support for More Audio Formats** ✅ (Partially implemented - WAV support added)
+  - ✅ Added support for WAV files with automatic conversion to MP3 before processing
+  - Extend support to include additional formats like FLAC, OGG, etc.
+  - ✅ Updated the `find_audio_files` function in `utils.py` to recognize WAV extension
 
 - **Customizable Enhancement Prompts**
   - Allow users to provide their own prompts via CLI arguments or configuration files.
diff --git a/main.py b/main.py
index c43b4db..9c94993 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,7 @@ import os
 from tqdm import tqdm
 
 from src.cli import parse_args
-from src.utils import find_audio_files, ensure_directory_exists
+from src.utils import find_audio_files, ensure_directory_exists, convert_wav_to_mp3
 from src.transcriber import transcribe_audio
 from src.enhancer import enhance_note
 
@@ -27,7 +27,7 @@ def main():
         base_name = os.path.splitext(os.path.basename(audio_path))[0]
         txt_path = os.path.join(output_dir, base_name + ".txt")
         md_path = os.path.join(output_dir, base_name + ".md")
-
+        
         # Skip if enhanced note already exists (unless force flag is set)
         if os.path.exists(md_path) and not args.force:
             print(f"Skipping {audio_path} - enhanced note already exists at {md_path}")
@@ -39,9 +39,20 @@ def main():
             with open(txt_path, "r", encoding="utf-8") as f:
                 transcription = f.read()
         else:
+            # Convert WAV to MP3 if needed
+            path_to_transcribe = audio_path
+            
+            if audio_path.lower().endswith(".wav"):
+                print(f"Processing WAV file: {audio_path}")
+                mp3_path = convert_wav_to_mp3(audio_path)
+                if mp3_path:
+                    path_to_transcribe = mp3_path
+                else:
+                    print(f"Warning: Failed to convert {audio_path}. Will attempt to transcribe the WAV file directly.")
+            
             # Transcribe audio
             transcription = transcribe_audio(
-                audio_path,
+                path_to_transcribe,
                 txt_path,
                 model_size=model_size,
                 turbo=args.turbo
diff --git a/requirements.txt b/requirements.txt
index 8852add..90a05f1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ faster-whisper
 openai
 argparse
 tqdm
+pydub
diff --git a/src/cli.py b/src/cli.py
index 10e3d19..36b159d 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -10,7 +10,7 @@ def parse_args():
         "--input-dir",
         type=str,
         required=True,
-        help="Directory containing audio files (.mp3, .m4a)"
+        help="Directory containing audio files (.mp3, .m4a, .wav)"
     )
     parser.add_argument(
         "--output-dir",
diff --git a/src/utils.py b/src/utils.py
index 9802897..ad3a8c4 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -1,8 +1,10 @@
 # Helper functions for mknotes
 
 import os
+import tempfile
+from pydub import AudioSegment
 
-def find_audio_files(directory, extensions=(".mp3", ".m4a")):
+def find_audio_files(directory, extensions=(".mp3", ".m4a", ".wav")):
     """
     Recursively find all audio files in the given directory with the specified extensions.
     Returns a list of file paths.
@@ -19,3 +21,33 @@ def ensure_directory_exists(directory):
     Create the directory if it does not exist.
     """
     os.makedirs(directory, exist_ok=True)
+
+def convert_wav_to_mp3(wav_path):
+    """
+    Convert a WAV file to an MP3 saved next to it, reusing an existing non-empty MP3.
+
+    Args:
+        wav_path: Path to the WAV file
+
+    Returns:
+        Path to the MP3 file, or None if the conversion failed
+    """
+    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"
+    partial_path = mp3_path + ".part"  # scratch file, renamed only on success
+
+    # An empty MP3 (e.g. left by an interrupted export) is useless, so never reuse it
+    if os.path.exists(mp3_path) and os.path.getsize(mp3_path) > 0:
+        print(f"Using existing MP3 file: {mp3_path}")
+        return mp3_path
+
+    try:
+        with open(partial_path, "wb+") as partial_file:
+            AudioSegment.from_wav(wav_path).export(partial_file, format="mp3")
+        os.replace(partial_path, mp3_path)
+        print(f"Converted {wav_path} to MP3 format at {mp3_path}")
+        return mp3_path
+    except Exception as e:
+        if os.path.exists(partial_path):
+            os.remove(partial_path)
+        print(f"Error converting {wav_path} to MP3: {e}")
+        return None