# Source code for espnet2.text.whisper_token_id_converter

from typing import Iterable, List, Union

import numpy as np
from typeguard import check_argument_types

# <sos> and <eos> for Whisper multilingual ---
# '<|startoftranscript|>': 50258
# '<|endoftext|>':         50257

# <sos> and <eos> for Whisper english ---
# '<|startoftranscript|>': 50257
# '<|endoftext|>':         50256


class OpenAIWhisperTokenIDConverter:
    """Convert between token strings and token IDs with a Whisper tokenizer.

    The wrapped object is whatever ``whisper.tokenizer.get_tokenizer``
    returns; it is stored on ``self.tokenizer``.

    Args:
        model_type: ``"whisper_en"`` builds the tokenizer with
            ``multilingual=False``; ``"whisper_multilingual"`` (default)
            builds it with ``multilingual=True, language=None``.

    Raises:
        ValueError: If ``model_type`` is neither of the two supported values.
        Exception: Whatever importing ``whisper.tokenizer`` raised; it is
            re-raised after installation instructions are printed.
    """

    def __init__(
        self,
        model_type: str = "whisper_multilingual",
    ):
        assert check_argument_types()

        # Imported inside the constructor so that a missing/broken whisper
        # install is reported together with installation instructions.
        try:
            import whisper.tokenizer
        except Exception:
            print("Error: whisper is not properly installed.")
            print(
                "Please install whisper with: cd ${MAIN_ROOT}/tools && "
                "./installers/install_whisper.sh"
            )
            # Bare ``raise`` re-raises the active exception unchanged.
            raise

        if model_type == "whisper_en":
            self.tokenizer = whisper.tokenizer.get_tokenizer(multilingual=False)
        elif model_type == "whisper_multilingual":
            # TODO(Shih-Lun): should support feeding in
            #                  different languages (default is en)
            self.tokenizer = whisper.tokenizer.get_tokenizer(
                multilingual=True, language=None
            )
        else:
            # Single formatted message: passing the value as a second
            # positional argument would render the error as a tuple.
            raise ValueError(f"tokenizer unsupported: {model_type}")

    def get_num_vocabulary_size(self) -> int:
        """Return the base vocabulary size plus the number of added tokens."""
        inner = self.tokenizer.tokenizer
        return inner.vocab_size + len(inner.get_added_vocab())

    def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        """Map token IDs to token strings.

        The underlying tokenizer is called with ``skip_special_tokens=True``.

        Args:
            integers: Token IDs to convert.

        Returns:
            The result of the underlying ``convert_ids_to_tokens`` call.
        """
        return self.tokenizer.tokenizer.convert_ids_to_tokens(
            integers, skip_special_tokens=True
        )

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        """Map token strings to token IDs, with a fixed ID prefix prepended.

        The prefix is ``sot_sequence_including_notimestamps`` without its
        first element.
        NOTE(review): presumably the dropped element is
        ``<|startoftranscript|>``, supplied elsewhere as ``<sos>`` - confirm
        against the caller.

        Args:
            tokens: Token strings to convert.

        Returns:
            The prefix IDs followed by the IDs of ``tokens``.
        """
        prefix_ids = list(self.tokenizer.sot_sequence_including_notimestamps[1:])
        return prefix_ids + self.tokenizer.tokenizer.convert_tokens_to_ids(tokens)