# mknotes/src/transcriber.py

# Audio transcription logic using Faster Whisper

from faster_whisper import WhisperModel

def transcribe_audio(input_path, output_path, model_size="medium", turbo=False):
    """
    Transcribe an audio file using Faster Whisper and save the result as a .txt file.

    Each segment's text is stripped and written on its own line; leading and
    trailing whitespace of the whole transcript is removed before it is
    written and returned.

    Args:
        input_path: Path to the audio file
        output_path: Path to save the transcription (written as UTF-8,
            overwriting any existing file)
        model_size: Size of the Whisper model to use
        turbo: When True, "int8_float16" is tried first before falling back
            to "float16", "int8" and "float32"; when False the fallback
            chain starts at "float16"

    Returns:
        The transcription text that was written to output_path.

    Raises:
        ValueError: Re-raised from WhisperModel when the error is not a
            compute-type complaint, or when the last candidate compute type
            is rejected as well.
        RuntimeError: If no model could be initialized at all.
    """
    # Candidate compute types in order of preference.
    if turbo:
        compute_types = ["int8_float16", "float16", "int8", "float32"]
    else:
        compute_types = ["float16", "int8", "float32"]

    model = _init_whisper_model(model_size, compute_types)

    # Only the segments are needed; the accompanying info object is ignored.
    segments, _ = model.transcribe(input_path)

    # Join once instead of growing a string with += inside the loop.
    transcription = "\n".join(segment.text.strip() for segment in segments).strip()

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(transcription)
    return transcription


def _init_whisper_model(model_size, compute_types):
    """Return a WhisperModel built with the first compute type that is accepted."""
    for compute_type in compute_types:
        print(f"Attempting to initialize model with compute_type: {compute_type}")
        try:
            model = WhisperModel(model_size, compute_type=compute_type)
        except ValueError as e:
            # Only a compute-type complaint is recoverable, and only while
            # there is still another candidate left to try.
            if "compute type" not in str(e) or compute_type == compute_types[-1]:
                raise
            print(f"Warning: {compute_type} compute type not supported by your hardware.")
            print("Trying next compute type...")
        else:
            print(f"Successfully initialized model with compute_type: {compute_type}")
            return model

    # Reached only when compute_types is empty.
    raise RuntimeError("Failed to initialize Whisper model with any compute type")