Decoding improvements (#1033)

* suppress task tokens (transcribe/translate)

* do not ignore the last segment when it ends with a single timestamp
This commit is contained in:
Jong Wook Kim
2023-03-06 14:32:32 -05:00
committed by GitHub
parent 3e1780fd37
commit eab8d920ed
3 changed files with 30 additions and 16 deletions
+8
View File
@@ -160,6 +160,14 @@ class Tokenizer:
def eot(self) -> int:
    """Return the wrapped tokenizer's ``eos_token_id``."""
    return self.tokenizer.eos_token_id
@cached_property
def transcribe(self) -> int:
    """Return the id that ``_get_single_token_id`` resolves for ``<|transcribe|>``."""
    return self._get_single_token_id("<|transcribe|>")
@cached_property
def translate(self) -> int:
    """Return the id that ``_get_single_token_id`` resolves for ``<|translate|>``."""
    return self._get_single_token_id("<|translate|>")
@cached_property
def sot(self) -> int:
    """Return the id that ``_get_single_token_id`` resolves for ``<|startoftranscript|>``."""
    return self._get_single_token_id("<|startoftranscript|>")