from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

from paddlenlp.transformers.tokenizer_utils_base import PaddingStrategy, PretrainedTokenizerBase


@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs to the longest sequence in the batch.

    Args:
        tokenizer (`paddlenlp.transformers.PretrainedTokenizer`):
            The tokenizer used for encoding the data.
    """

    tokenizer: PretrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pd"
    return_attention_mask: Optional[bool] = None

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Delegate padding to the tokenizer: every example in `features` is padded
        # to the longest sequence in the batch (or to `max_length`, depending on `padding`).
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
            return_attention_mask=self.return_attention_mask,
        )
        # Rename the label key(s) so downstream models receive them as "labels".
        if "label" in batch:
            batch["labels"] = batch["label"]
            del batch["label"]
        if "label_ids" in batch:
            batch["labels"] = batch["label_ids"]
            del batch["label_ids"]
        return batch
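For context, here is a minimal usage sketch (not from the original source): it assumes a PaddleNLP `AutoTokenizer` loaded from the hypothetical checkpoint name "ernie-3.0-medium-zh" and two made-up examples of different lengths, and shows the collator padding them into a single batch of Paddle tensors while renaming "label" to "labels".

from paddlenlp.transformers import AutoTokenizer

# Usage sketch: the model name and example texts below are illustrative assumptions.
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
collator = DataCollatorWithPadding(tokenizer=tokenizer)

features = [
    {**tokenizer("今天天气不错"), "label": 1},
    {**tokenizer("这部电影太长了，情节也很拖沓"), "label": 0},
]

batch = collator(features)
# input_ids / token_type_ids are padded to the longest sequence in the batch,
# and the scalar "label" values come back as a "labels" tensor.
print(batch["input_ids"].shape)
print(batch["labels"])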