from transformers import (
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
    BlipTextConfig,
)
from transformers.models.blip.modeling_blip_text import BlipTextLMHeadModel
# Load the pretrained BLIP captioning model from the Hugging Face Hub.
# NOTE: `from_pretrained` expects a repo id ("org/name"), not a URL —
# the original "huggingface.co/..." prefix would fail to resolve.
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# Swap the English text decoder for a freshly initialized one built from
# the default BlipTextConfig. This decoder is untrained: it must be
# fine-tuned before it generates meaningful captions.
text_config = BlipTextConfig()
model.text_decoder = BlipTextLMHeadModel(text_config)

# During actual training, BERT's tokenizer can handle encoding/decoding.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# `pixel_input_batch` is not defined in this snippet — presumably a batch
# of preprocessed image tensors from BlipProcessor, shaped (batch, 3, H, W).
# TODO(review): confirm where it is produced by the surrounding code.
output_batch = model.generate(pixel_values=pixel_input_batch)

# Decode each generated token-id sequence into a caption. Iterating the
# batch directly avoids depending on an undefined `batch_size` variable.
for token_ids in output_batch:
    caption = bert_tokenizer.decode(token_ids, skip_special_tokens=True)
    print(caption)