transformers

Published: 2023-06-16 10:05:49 · Author: 15375357604

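(The snippets below assume torch is available and that the tokenizer is a BERT WordPiece tokenizer; bert-base-uncased is an assumption, but it matches the ids printed in this post, where 101 is [CLS] and 102 is [SEP].)

import torch
from transformers import BertTokenizer

# Assumed checkpoint: bert-base-uncased (its vocabulary matches the ids shown below)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")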
sentence = "Hello, my son is cuting."
input_ids_method1 = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True))  # Batch size 1; tokenizes and maps to ids in one step
# tensor([ 101, 7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012, 102])
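The mapping is reversible; a quick sanity check (assuming the same tokenizer as above) with tokenizer.decode:

# ids -> text: the subword pieces are merged back and the special tokens reappear
print(tokenizer.decode(input_ids_method1.tolist()))
# [CLS] hello, my son is cuting. [SEP]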

input_token2 = tokenizer.tokenize(sentence)  # WordPiece tokenization only (tokens, not ids)
# ['hello', ',', 'my', 'son', 'is', 'cut', '##ing', '.']


input_ids_method2 = tokenizer.convert_tokens_to_ids(input_token2)  # map each token to its vocabulary id
# [7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012]
# Note: no [CLS] / [SEP] markers at the start and end

(When add_special_tokens=False is passed to tokenizer.encode, the [CLS] and [SEP] markers are likewise omitted.)
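The two routes therefore agree; a minimal sketch (assuming the same tokenizer as above) that checks this and then adds the special tokens back with build_inputs_with_special_tokens:

# encode(..., add_special_tokens=False) gives the same ids as tokenize + convert_tokens_to_ids
ids_no_special = tokenizer.encode(sentence, add_special_tokens=False)
assert ids_no_special == input_ids_method2  # [7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012]

# Wrap the ids in [CLS] ... [SEP] to recover the method-1 result
ids_with_special = tokenizer.build_inputs_with_special_tokens(ids_no_special)
# [101, 7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012, 102]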

 

print(tokenizer.encode_plus(sentence))  # encode_plus returns a dict with three fields: input_ids, token_type_ids, attention_mask
# {'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 3013, 2075, 1012, 102],
#  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
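In recent transformers versions, calling the tokenizer object directly returns the same three fields as encode_plus, and can pad, truncate, and return tensors in one call; a short sketch assuming the same tokenizer (the max_length of 16 is an arbitrary choice):

encoded = tokenizer(
    sentence,
    padding="max_length",  # pad up to max_length
    truncation=True,
    max_length=16,
    return_tensors="pt",   # return PyTorch tensors instead of Python lists
)
print(encoded["input_ids"].shape)    # torch.Size([1, 16])
print(encoded["attention_mask"][0])  # 1 for real tokens, 0 for padding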