目录
函数原型
def __call__(self,
text: Union[str, List[str], List[List[str]]],
text_pair: Optional[Union[str, List[str],
List[List[str]]]] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: Union[bool, str] = False,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
return_position_ids: bool = False,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_length: bool = False,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_dict: bool = True,
return_offsets_mapping: bool = False,
add_special_tokens: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
**kwargs):
函数含义
Performs tokenization and uses the tokenized tokens to prepare model inputs. It supports a single sequence or a sequence pair as input, and batch input is allowed. `self.encode()` or `self.batch_encode()` is called for single or batch input respectively, depending on the input format and the `is_split_into_words` argument.