# Demonstrates two equivalent ways of turning a sentence into token ids with
# `tokenizer` (defined elsewhere), plus `encode_plus`.
sentence = "Hello, my son is cuting."

# Method 1: tokenize and map to ids in a single call (batch size 1).
input_ids_method1 = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True))
# tensor([ 101, 7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012, 102])

# Method 2, step 1: WordPiece tokenization only.
input_token2 = tokenizer.tokenize(sentence)
# ['hello', ',', 'my', 'son', 'is', 'cut', '##ing', '.']

# Method 2, step 2: map each token to its corresponding id.
input_ids_method2 = tokenizer.convert_tokens_to_ids(input_token2)
# [7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012]
# Note: the result is not wrapped in torch.tensor here, and it has no leading
# or trailing special tokens ([CLS], [SEP]) -- compare with the ids 101 and 102
# at both ends of the method 1 output.
# (When add_special_tokens is set to False in tokenizer.encode, the [CLS] and
# [SEP] markers are likewise not added.)

# encode_plus returns a dict with three entries: input_ids, token_type_ids
# and attention_mask.
print(tokenizer.encode_plus(sentence))
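# Output:
# [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102]
# {'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
# NOTE(review): this output has 9 ids (5870 where the first encoding above shows
# 3013, 2075), so it appears to have been produced from a different input
# sentence than "Hello, my son is cuting." -- confirm before relying on it.
#
# Tags: transformers, tokenizer, sentence, 7592, ids, input, 1012
# From: https://www.cnblogs.com/qiaoqifa/p/17484861.html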