small format tweaks, get rid of prints in tokenizer
This commit is contained in:
+1
-4
@@ -27,8 +27,5 @@ for line in proc.stdout:
|
|||||||
last = dec
|
last = dec
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
|
|
||||||
print('\n---\n')
|
print(f"\nachieved tok/s: {len(tokens) / (t1 - t0)}")
|
||||||
print(enc.decode(tokens))
|
|
||||||
|
|
||||||
print(f"achieved tok/s: {len(tokens) / (t1 - t0)}")
|
|
||||||
proc.wait()
|
proc.wait()
|
||||||
|
|||||||
+2
-4
@@ -15,16 +15,14 @@ class Tokenizer:
|
|||||||
model_path = TOKENIZER_MODEL
|
model_path = TOKENIZER_MODEL
|
||||||
assert os.path.isfile(model_path), model_path
|
assert os.path.isfile(model_path), model_path
|
||||||
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
||||||
print(f"Loaded SentencePiece model from {model_path}")
|
#print(f"Loaded SentencePiece model from {model_path}")
|
||||||
|
|
||||||
# BOS / EOS token IDs
|
# BOS / EOS token IDs
|
||||||
self.n_words: int = self.sp_model.vocab_size()
|
self.n_words: int = self.sp_model.vocab_size()
|
||||||
self.bos_id: int = self.sp_model.bos_id()
|
self.bos_id: int = self.sp_model.bos_id()
|
||||||
self.eos_id: int = self.sp_model.eos_id()
|
self.eos_id: int = self.sp_model.eos_id()
|
||||||
self.pad_id: int = self.sp_model.pad_id()
|
self.pad_id: int = self.sp_model.pad_id()
|
||||||
print(
|
#print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
|
||||||
f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
|
|
||||||
)
|
|
||||||
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
|
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
|
||||||
|
|
||||||
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
|
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
|
||||||
|
|||||||
Reference in New Issue
Block a user