Add WAV file support with MP3 conversion and reuse

- Add support for WAV files with automatic conversion to MP3
- Save converted MP3 files in the same directory as WAV files
- Reuse existing MP3 files if already converted
- Update documentation and requirements
This commit is contained in:
2025-05-22 20:53:39 +02:00
parent b6e6e80cfb
commit c47089aa0d
6 changed files with 71 additions and 11 deletions
+18 -3
View File
@@ -1,10 +1,13 @@
# mknotes # mknotes
A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model. A command-line tool to transcribe all MP3, M4A, and WAV audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
## Features ## Features
- Batch transcribes all `.mp3` and `.m4a` files in a specified directory - Batch transcribes all `.mp3`, `.m4a`, and `.wav` files in a specified directory
- Automatically converts WAV files to MP3 format before processing
- Converted MP3 files are saved in the same directory as the original WAV files
- Reuses existing MP3 files if they've already been converted
- Saves transcriptions as `.txt` files - Saves transcriptions as `.txt` files
- Enhances notes using GPT-4.1 with a custom prompt - Enhances notes using GPT-4.1 with a custom prompt
- Outputs enhanced notes in markdown format - Outputs enhanced notes in markdown format
@@ -23,6 +26,16 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies # Install dependencies
pip install -r requirements.txt pip install -r requirements.txt
# Install ffmpeg (required for WAV to MP3 conversion)
# On Ubuntu/Debian:
# sudo apt-get install ffmpeg
# On macOS with Homebrew:
# brew install ffmpeg
# On Windows:
# Download from https://ffmpeg.org/download.html and add to PATH
``` ```
## Usage ## Usage
@@ -32,7 +45,7 @@ export OPENAI_API_KEY="your-api-key-here"
python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo] python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo]
``` ```
- `--input-dir`: Directory containing audio files (required) - `--input-dir`: Directory containing audio files (.mp3, .m4a, .wav) (required)
- `--output-dir`: Directory for output files (default: "output") - `--output-dir`: Directory for output files (default: "output")
- `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type) - `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
- `--force`: Force re-processing of files even if output files already exist - `--force`: Force re-processing of files even if output files already exist
@@ -62,3 +75,5 @@ If a compute type is not supported, the program will try the next one in the lis
- Python 3.8+ - Python 3.8+
- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) - [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
- [OpenAI Python SDK](https://github.com/openai/openai-python) - [OpenAI Python SDK](https://github.com/openai/openai-python)
- [pydub](https://github.com/jiaaro/pydub) (for WAV to MP3 conversion)
- [ffmpeg](https://ffmpeg.org/) (required by pydub for audio conversion)
+4 -3
View File
@@ -4,9 +4,10 @@ This document outlines proposed improvements for the mknotes software, grouped b
## Feature Enhancements ## Feature Enhancements
- **Support for More Audio Formats** - **Support for More Audio Formats** ✅ (Partially implemented - WAV support added)
- Extend support beyond MP3 and M4A to include formats like WAV, FLAC, OGG, etc. - ✅ Added support for WAV files with automatic conversion to MP3 before processing
- Update the `find_audio_files` function in `utils.py` to recognise additional extensions. - Extend support to include additional formats like FLAC, OGG, etc.
- ✅ Updated the `find_audio_files` function in `utils.py` to recognize WAV extension
- **Customizable Enhancement Prompts** - **Customizable Enhancement Prompts**
- Allow users to provide their own prompts via CLI arguments or configuration files. - Allow users to provide their own prompts via CLI arguments or configuration files.
+14 -3
View File
@@ -4,7 +4,7 @@ import os
from tqdm import tqdm from tqdm import tqdm
from src.cli import parse_args from src.cli import parse_args
from src.utils import find_audio_files, ensure_directory_exists from src.utils import find_audio_files, ensure_directory_exists, convert_wav_to_mp3
from src.transcriber import transcribe_audio from src.transcriber import transcribe_audio
from src.enhancer import enhance_note from src.enhancer import enhance_note
@@ -27,7 +27,7 @@ def main():
base_name = os.path.splitext(os.path.basename(audio_path))[0] base_name = os.path.splitext(os.path.basename(audio_path))[0]
txt_path = os.path.join(output_dir, base_name + ".txt") txt_path = os.path.join(output_dir, base_name + ".txt")
md_path = os.path.join(output_dir, base_name + ".md") md_path = os.path.join(output_dir, base_name + ".md")
# Skip if enhanced note already exists (unless force flag is set) # Skip if enhanced note already exists (unless force flag is set)
if os.path.exists(md_path) and not args.force: if os.path.exists(md_path) and not args.force:
print(f"Skipping {audio_path} - enhanced note already exists at {md_path}") print(f"Skipping {audio_path} - enhanced note already exists at {md_path}")
@@ -39,9 +39,20 @@ def main():
with open(txt_path, "r", encoding="utf-8") as f: with open(txt_path, "r", encoding="utf-8") as f:
transcription = f.read() transcription = f.read()
else: else:
# Convert WAV to MP3 if needed
path_to_transcribe = audio_path
if audio_path.lower().endswith(".wav"):
print(f"Processing WAV file: {audio_path}")
mp3_path = convert_wav_to_mp3(audio_path)
if mp3_path:
path_to_transcribe = mp3_path
else:
print(f"Warning: Failed to convert {audio_path}. Will attempt to transcribe the WAV file directly.")
# Transcribe audio # Transcribe audio
transcription = transcribe_audio( transcription = transcribe_audio(
audio_path, path_to_transcribe,
txt_path, txt_path,
model_size=model_size, model_size=model_size,
turbo=args.turbo turbo=args.turbo
+1
View File
@@ -2,3 +2,4 @@ faster-whisper
openai openai
argparse argparse
tqdm tqdm
pydub
+1 -1
View File
@@ -10,7 +10,7 @@ def parse_args():
"--input-dir", "--input-dir",
type=str, type=str,
required=True, required=True,
help="Directory containing audio files (.mp3, .m4a)" help="Directory containing audio files (.mp3, .m4a, .wav)"
) )
parser.add_argument( parser.add_argument(
"--output-dir", "--output-dir",
+33 -1
View File
@@ -1,8 +1,10 @@
# Helper functions for mknotes # Helper functions for mknotes
import os import os
import tempfile
from pydub import AudioSegment
def find_audio_files(directory, extensions=(".mp3", ".m4a")): def find_audio_files(directory, extensions=(".mp3", ".m4a", ".wav")):
""" """
Recursively find all audio files in the given directory with the specified extensions. Recursively find all audio files in the given directory with the specified extensions.
Returns a list of file paths. Returns a list of file paths.
@@ -19,3 +21,33 @@ def ensure_directory_exists(directory):
Create the directory if it does not exist. Create the directory if it does not exist.
""" """
os.makedirs(directory, exist_ok=True) os.makedirs(directory, exist_ok=True)
def convert_wav_to_mp3(wav_path):
    """
    Convert a WAV file to MP3 format using pydub, reusing an existing MP3 if present.

    The MP3 is saved next to the WAV file (same directory, same base name).
    The conversion is written to a temporary ``.part`` file first and only
    moved to its final name once the export has finished, so a failed or
    interrupted conversion never leaves a truncated ``.mp3`` behind that a
    later run would mistake for a finished one.

    Args:
        wav_path: Path to the WAV file

    Returns:
        Path to the MP3 file, or None if the conversion failed.
    """
    # Determine the output MP3 path (same directory, same name, different extension)
    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"

    # Reuse a previous conversion, but ignore zero-byte leftovers: exporting
    # straight to the final path creates the file before the encoder runs, so
    # an earlier failed attempt may have left an empty MP3 behind.
    try:
        has_usable_mp3 = os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0
    except OSError:
        has_usable_mp3 = False
    if has_usable_mp3:
        print(f"Using existing MP3 file: {mp3_path}")
        return mp3_path

    # The ".part" suffix keeps the in-progress file from looking like a
    # finished MP3 if the process is killed mid-export.
    partial_path = mp3_path + ".part"

    # Convert WAV to MP3
    try:
        audio = AudioSegment.from_wav(wav_path)
        exported = audio.export(partial_path, format="mp3")
        # export() hands back the open output file; close it so the handle is
        # not leaked and the rename below also works on Windows.
        if exported is not None:
            exported.close()
        os.replace(partial_path, mp3_path)
    except Exception as e:
        # Best-effort by design: the caller treats None as "conversion failed".
        print(f"Error converting {wav_path} to MP3: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        return None

    print(f"Converted {wav_path} to MP3 format at {mp3_path}")
    return mp3_path