Add WAV file support with MP3 conversion and reuse
- Add support for WAV files with automatic conversion to MP3
- Save converted MP3 files in the same directory as WAV files
- Reuse existing MP3 files if already converted
- Update documentation and requirements
This commit is contained in:
@@ -1,10 +1,13 @@
|
|||||||
# mknotes
|
# mknotes
|
||||||
|
|
||||||
A command-line tool to transcribe all MP3 and M4A audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
|
A command-line tool to transcribe all MP3, M4A, and WAV audio files in a directory using Faster Whisper, then enhance the transcriptions into comprehensive notes using OpenAI's GPT-4.1 model.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Batch transcribes all `.mp3` and `.m4a` files in a specified directory
|
- Batch transcribes all `.mp3`, `.m4a`, and `.wav` files in a specified directory
|
||||||
|
- Automatically converts WAV files to MP3 format before processing
|
||||||
|
- Converted MP3 files are saved in the same directory as the original WAV files
|
||||||
|
- Reuses existing MP3 files if they've already been converted
|
||||||
- Saves transcriptions as `.txt` files
|
- Saves transcriptions as `.txt` files
|
||||||
- Enhances notes using GPT-4.1 with a custom prompt
|
- Enhances notes using GPT-4.1 with a custom prompt
|
||||||
- Outputs enhanced notes in markdown format
|
- Outputs enhanced notes in markdown format
|
||||||
@@ -23,6 +26,16 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
|
|||||||
|
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
# Install ffmpeg (required for WAV to MP3 conversion)
|
||||||
|
# On Ubuntu/Debian:
|
||||||
|
# sudo apt-get install ffmpeg
|
||||||
|
|
||||||
|
# On macOS with Homebrew:
|
||||||
|
# brew install ffmpeg
|
||||||
|
|
||||||
|
# On Windows:
|
||||||
|
# Download from https://ffmpeg.org/download.html and add to PATH
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
@@ -32,7 +45,7 @@ export OPENAI_API_KEY="your-api-key-here"
|
|||||||
python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo]
|
python main.py --input-dir /path/to/audio/files --output-dir /path/to/output [--turbo]
|
||||||
```
|
```
|
||||||
|
|
||||||
- `--input-dir`: Directory containing audio files (required)
|
- `--input-dir`: Directory containing audio files (.mp3, .m4a, .wav) (required)
|
||||||
- `--output-dir`: Directory for output files (default: "output")
|
- `--output-dir`: Directory for output files (default: "output")
|
||||||
- `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
|
- `--turbo`: Enable turbo mode for faster inference (uses int8_float16 compute type)
|
||||||
- `--force`: Force re-processing of files even if output files already exist
|
- `--force`: Force re-processing of files even if output files already exist
|
||||||
@@ -62,3 +75,5 @@ If a compute type is not supported, the program will try the next one in the lis
|
|||||||
- Python 3.8+
|
- Python 3.8+
|
||||||
- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
|
- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper)
|
||||||
- [OpenAI Python SDK](https://github.com/openai/openai-python)
|
- [OpenAI Python SDK](https://github.com/openai/openai-python)
|
||||||
|
- [pydub](https://github.com/jiaaro/pydub) (for WAV to MP3 conversion)
|
||||||
|
- [ffmpeg](https://ffmpeg.org/) (required by pydub for audio conversion)
|
||||||
|
|||||||
@@ -4,9 +4,10 @@ This document outlines proposed improvements for the mknotes software, grouped b
|
|||||||
|
|
||||||
## Feature Enhancements
|
## Feature Enhancements
|
||||||
|
|
||||||
- **Support for More Audio Formats**
|
- **Support for More Audio Formats** ✅ (Partially implemented - WAV support added)
|
||||||
- Extend support beyond MP3 and M4A to include formats like WAV, FLAC, OGG, etc.
|
- ✅ Added support for WAV files with automatic conversion to MP3 before processing
|
||||||
- Update the `find_audio_files` function in `utils.py` to recognise additional extensions.
|
- Extend support to include additional formats like FLAC, OGG, etc.
|
||||||
|
- ✅ Updated the `find_audio_files` function in `utils.py` to recognize WAV extension
|
||||||
|
|
||||||
- **Customizable Enhancement Prompts**
|
- **Customizable Enhancement Prompts**
|
||||||
- Allow users to provide their own prompts via CLI arguments or configuration files.
|
- Allow users to provide their own prompts via CLI arguments or configuration files.
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import os
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from src.cli import parse_args
|
from src.cli import parse_args
|
||||||
from src.utils import find_audio_files, ensure_directory_exists
|
from src.utils import find_audio_files, ensure_directory_exists, convert_wav_to_mp3
|
||||||
from src.transcriber import transcribe_audio
|
from src.transcriber import transcribe_audio
|
||||||
from src.enhancer import enhance_note
|
from src.enhancer import enhance_note
|
||||||
|
|
||||||
@@ -39,9 +39,20 @@ def main():
|
|||||||
with open(txt_path, "r", encoding="utf-8") as f:
|
with open(txt_path, "r", encoding="utf-8") as f:
|
||||||
transcription = f.read()
|
transcription = f.read()
|
||||||
else:
|
else:
|
||||||
|
# Convert WAV to MP3 if needed
|
||||||
|
path_to_transcribe = audio_path
|
||||||
|
|
||||||
|
if audio_path.lower().endswith(".wav"):
|
||||||
|
print(f"Processing WAV file: {audio_path}")
|
||||||
|
mp3_path = convert_wav_to_mp3(audio_path)
|
||||||
|
if mp3_path:
|
||||||
|
path_to_transcribe = mp3_path
|
||||||
|
else:
|
||||||
|
print(f"Warning: Failed to convert {audio_path}. Will attempt to transcribe the WAV file directly.")
|
||||||
|
|
||||||
# Transcribe audio
|
# Transcribe audio
|
||||||
transcription = transcribe_audio(
|
transcription = transcribe_audio(
|
||||||
audio_path,
|
path_to_transcribe,
|
||||||
txt_path,
|
txt_path,
|
||||||
model_size=model_size,
|
model_size=model_size,
|
||||||
turbo=args.turbo
|
turbo=args.turbo
|
||||||
|
|||||||
@@ -2,3 +2,4 @@ faster-whisper
|
|||||||
openai
|
openai
|
||||||
argparse
|
argparse
|
||||||
tqdm
|
tqdm
|
||||||
|
pydub
|
||||||
|
|||||||
+1
-1
@@ -10,7 +10,7 @@ def parse_args():
|
|||||||
"--input-dir",
|
"--input-dir",
|
||||||
type=str,
|
type=str,
|
||||||
required=True,
|
required=True,
|
||||||
help="Directory containing audio files (.mp3, .m4a)"
|
help="Directory containing audio files (.mp3, .m4a, .wav)"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output-dir",
|
"--output-dir",
|
||||||
|
|||||||
+33
-1
@@ -1,8 +1,10 @@
|
|||||||
# Helper functions for mknotes
|
# Helper functions for mknotes
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
def find_audio_files(directory, extensions=(".mp3", ".m4a")):
|
def find_audio_files(directory, extensions=(".mp3", ".m4a", ".wav")):
|
||||||
"""
|
"""
|
||||||
Recursively find all audio files in the given directory with the specified extensions.
|
Recursively find all audio files in the given directory with the specified extensions.
|
||||||
Returns a list of file paths.
|
Returns a list of file paths.
|
||||||
@@ -19,3 +21,33 @@ def ensure_directory_exists(directory):
|
|||||||
Create the directory if it does not exist.
|
Create the directory if it does not exist.
|
||||||
"""
|
"""
|
||||||
os.makedirs(directory, exist_ok=True)
|
os.makedirs(directory, exist_ok=True)
|
||||||
|
|
||||||
|
def convert_wav_to_mp3(wav_path):
    """
    Convert a WAV file to MP3 format using pydub, reusing an earlier conversion if present.

    The MP3 file is saved in the same directory as the WAV file, with the same
    base name and a ".mp3" extension. The conversion is written to a temporary
    ".part" file first and only moved into place once it has completed, so a
    failed or interrupted conversion never leaves a broken MP3 behind that a
    later run would mistake for a finished one.

    Args:
        wav_path: Path to the WAV file

    Returns:
        Path to the MP3 file, or None if the conversion failed
    """
    # Determine the output MP3 path (same directory, same name, different extension)
    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"

    # Reuse an existing MP3 only if it is a regular, non-empty file. An empty
    # file is the typical leftover of a conversion that failed part-way.
    if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0:
        print(f"Using existing MP3 file: {mp3_path}")
        return mp3_path

    # Export next to the target so the final os.replace stays on one filesystem.
    # The ".part" suffix keeps the unfinished file from being found as audio.
    partial_path = mp3_path + ".part"

    # Convert WAV to MP3
    try:
        audio = AudioSegment.from_wav(wav_path)
        # export() hands back the open output file handle; close it so the file
        # is flushed and can be renamed on platforms that lock open files.
        exported = audio.export(partial_path, format="mp3")
        exported.close()
        os.replace(partial_path, mp3_path)
    except Exception as e:
        # Deliberately broad: conversion is best-effort and the caller falls
        # back to the original WAV file when None is returned.
        print(f"Error converting {wav_path} to MP3: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written, or the file cannot be removed right now.
            pass
        return None

    print(f"Converted {wav_path} to MP3 format at {mp3_path}")
    return mp3_path
|
||||||
|
|||||||
Reference in New Issue
Block a user