我正在编写一个用于预处理图像的独立代码 image_processing_vit 但是,我的结果与图书馆的结果不同。以下代码包含两部分:(1) 不使用变压器,(2) 使用变压器。
我不知道我错过了什么。我已阅读代码+询问副驾驶,但它不能解决问题。
请帮助我!谢谢。
import cv2
import numpy as np
cv_img = cv2.imread("1.png")
cv_img_RGB = cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)
np_cv_img_RGB = np.array(cv_img_RGB)
np_cv_img_RGB_resize = cv2.resize(np_cv_img_RGB, (224,224))
np_cv_img_RGB_resize_255 = np_cv_img_RGB_resize/255.0
mean=[0.485, 0.456, 0.406]
std=[0.229, 0.224, 0.225]
np_cv_img_RGB_resize_255[...,0] -= mean[0]
np_cv_img_RGB_resize_255[...,1] -= mean[1]
np_cv_img_RGB_resize_255[...,2] -= mean[2]
np_cv_img_RGB_resize_255[...,0] /= std[0]
np_cv_img_RGB_resize_255[...,1] /= std[1]
np_cv_img_RGB_resize_255[...,2] /= std[2]
print(np_cv_img_RGB_resize_255)
# array([[[-0.49105232, 0.03011204, 0.16505447],
# [-0.49105232, 0.03011204, 0.16505447],
# [-0.50817707, 0.01260504, 0.18248366],
# ...,
# [ 0.38231013, 1.16806723, 1.57681917],
# [ 0.34806062, 1.13305322, 1.54196078],
# [ 0.34806062, 1.13305322, 1.54196078]],
# [[-0.50817707, 0.01260504, 0.14762527],
# [-0.49105232, 0.03011204, 0.14762527],
# [-0.49105232, 0.03011204, 0.19991285],
# ...,
# [ 0.38231013, 1.16806723, 1.57681917],
# [ 0.34806062, 1.13305322, 1.54196078],
# [ 0.34806062, 1.13305322, 1.54196078]],
# [[-0.50817707, 0.01260504, 0.14762527],
# [-0.52530182, -0.00490196, 0.13019608],
# [-0.54242658, -0.02240896, 0.14762527],
# ...,
# [ 0.34806062, 1.16806723, 1.55938998],
# [ 0.33093587, 1.15056022, 1.54196078],
# [ 0.33093587, 1.15056022, 1.54196078]],
# ...,
# [[ 2.14615977, 2.41106443, 1.29795207],
# [ 2.18040928, 2.39355742, 1.33281046],
# [ 1.66666667, 2.02591036, 1.08880174],
# ...,
# [-0.47392756, -0.09243697, -0.07895425],
# [-0.49105232, -0.10994398, -0.09638344],
# [-0.49105232, -0.10994398, -0.09638344]],
# [[ 1.99203699, 2.30602241, 1.40252723],
# [ 2.14615977, 2.41106443, 1.29795207],
# [ 2.18040928, 2.42857143, 1.40252723],
# ...,
# [-0.43967805, -0.05742297, -0.04409586],
# [-0.4225533 , -0.09243697, -0.02666667],
# [-0.45680281, -0.09243697, -0.02666667]],
# [[ 1.63241716, 2.07843137, 1.14108932],
# [ 2.14615977, 2.42857143, 1.26309368],
# [ 2.07766076, 2.37605042, 1.24566449],
# ...,
# [-0.43967805, -0.05742297, -0.04409586],
# [-0.40542855, -0.07492997, -0.00923747],
# [-0.45680281, -0.09243697, -0.02666667]]])
# ######################################################################################
from transformers import ViTFeatureExtractor
feature_extractor = ViTFeatureExtractor(do_resize=True, size=224,
do_normalize=True,
image_mean=(0.485, 0.456, 0.406), image_std=(0.229, 0.224, 0.225)
).from_pretrained("./app/vit")
# in ./app/vit include: config.json, preprocessor_config.json
b = feature_extractor(np_cv_img_RGB)
print(b['pixel_values'][0])
# array([[[-0.25490195, -0.25490195, -0.25490195, ..., 0.14509809,
# 0.13725495, 0.12941182],
# [-0.25490195, -0.26274508, -0.25490195, ..., 0.14509809,
# 0.13725495, 0.12941182],
# [-0.26274508, -0.27058822, -0.29411763, ..., 0.12941182,
# 0.12156868, 0.12156868],
# ...,
# [ 0.9529412 , 0.96862745, 0.7019608 , ..., -0.24705881,
# -0.25490195, -0.25490195],
# [ 0.8039216 , 0.9529412 , 0.7647059 , ..., -0.23137254,
# -0.23137254, -0.23921567],
# [ 0.7254902 , 0.9372549 , 0.8901961 , ..., -0.23137254,
# -0.2235294 , -0.23137254]],
# [[-0.0745098 , -0.0745098 , -0.0745098 , ..., 0.43529415,
# 0.427451 , 0.41960788],
# [-0.0745098 , -0.08235294, -0.0745098 , ..., 0.43529415,
# 0.427451 , 0.41960788],
# [-0.08235294, -0.09019607, -0.11372548, ..., 0.43529415,
# 0.427451 , 0.427451 ],
# ...,
# [ 0.9843137 , 0.99215686, 0.7647059 , ..., -0.12941176,
# -0.1372549 , -0.1372549 ],
# [ 0.85882354, 0.9843137 , 0.8352941 , ..., -0.11372548,
# -0.12156862, -0.12941176],
# [ 0.827451 , 0.99215686, 0.9529412 , ..., -0.11372548,
# -0.12156862, -0.12941176]],
# [[-0.11372548, -0.11372548, -0.09803921, ..., 0.52156866,
# 0.5137255 , 0.5058824 ],
# [-0.11372548, -0.11372548, -0.09803921, ..., 0.52156866,
# 0.5137255 , 0.5058824 ],
# [-0.12156862, -0.12156862, -0.1372549 , ..., 0.5137255 ,
# 0.5058824 , 0.5058824 ],
# ...,
# [ 0.39607847, 0.41176474, 0.26274514, ..., -0.2235294 ,
# -0.23137254, -0.23137254],
# [ 0.33333337, 0.39607847, 0.30980396, ..., -0.20784312,
# -0.19999999, -0.20784312],
# [ 0.28627455, 0.3803922 , 0.3803922 , ..., -0.20784312,
# -0.19215685, -0.19999999]]], dtype=float32)
注意: config.json:
{
"_name_or_path": "google/vit-base-patch16-224-in21k",
"architectures": [
"ViTModel"
],
"attention_probs_dropout_prob": 0.0,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 768,
"image_size": 224,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"model_type": "vit",
"num_attention_heads": 12,
"num_channels": 3,
"num_hidden_layers": 12,
"patch_size": 16,
"qkv_bias": true,
"transformers_version": "4.13.0.dev0"
}
preprocessor_config.json
{
"do_normalize": true,
"do_resize": true,
"image_mean": [
0.5,
0.5,
0.5
],
"image_std": [
0.5,
0.5,
0.5
],
"size": 224
}
我尝试更改通道顺序,进行插值,但效果不好
问题在于你加载预训练
ViTFeatureExtractor
的方式。虽然你在代码中手动指定了均值和标准差,但
from_pretrained
方法会从
./app/vit
文件夹加载预处理器配置(
preprocessor_config.json
),覆盖你手动指定的参数。
解决方法:
你有两个选择:
-
修改
preprocessor_config.json
文件: 将image_mean
和image_std
更新为你想要的值:
{
"do_normalize": true,
"do_resize": true,
"image_mean": [
0.485,
0.456,
0.406
],
"image_std": [
0.229,
0.224,
0.225
],
"size": 224
}
-
不要使用
from_pretrained
方法: 直接创建ViTFeatureExtractor
实例,并传递你想要的参数:
from transformers import ViTFeatureExtractor
feature_extractor = ViTFeatureExtractor(
do_resize=True,
size=224,
do_normalize=True,
image_mean=(0.485, 0.456, 0.406),
image_std=(0.229, 0.224, 0.225),
)
b = feature_extractor(np_cv_img_RGB)
print(b['pixel_values'][0])
这样做将使用你指定的均值和标准差,而不是从预处理器配置文件中加载。
建议: 建议使用第二种方法,因为它可以让你更灵活地控制预处理参数,而无需修改配置文件。
标签:python,pytorch,computer-vision,huggingface-transformers,image-preprocessing From: 78776752