From ea4cedc5884ddbf18da82dc088f33a3ae980f1c6 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sun, 13 Aug 2023 02:00:19 +0000
Subject: [PATCH] add ability to export custom tokenizer to .bin format for run.c file

---
 tokenizer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index 981b2ac..bc2a35a 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -4,7 +4,7 @@
 
 import os
 import struct
-from logging import getLogger
+import argparse
 from typing import List
 
 from sentencepiece import SentencePieceProcessor
@@ -72,5 +72,9 @@ class Tokenizer:
             f.write(bytes)
 
 if __name__ == "__main__":
-    t = Tokenizer()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ")
+    args = parser.parse_args()
+
+    t = Tokenizer(args.tokenizer_model)
     t.export()