网上关于 TensorRT 量化的教程五花八门,但很多文章没有说明所使用的版本,导致无法复现。本文参考了一些写法,实现了 ONNX 模型的 INT8/FP16 量化,并将其转换为 TensorRT engine。具体实现代码见下方,欢迎各位小伙伴批评指正。
tensorrt安装
参考 CSDN 博客:《windows11下安装Tensor RT,并在conda虚拟环境下使用》
pycuda安装
参考 CSDN 博客:《GPU编程(基于Python和CUDA)(一)——零基础安装pycuda》
代码
版本说明:
tensorrt:8.5.3.1
cuda:11.7
注意:不同 TensorRT 版本的部分 API 并不一致,下面的代码基于上述版本编写,使用其他版本时可能需要相应调整。
import cv2
import glob
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
import numpy as np
from PIL import Image
def preprocess_input(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """
    Scale an image to [0, 1] and normalise it per channel.

    :param image: array whose last axis can broadcast against ``mean`` and
        ``std`` (three channels for the defaults). The input is not modified.
    :param mean: per-channel mean subtracted after scaling by 1/255.
    :param std: per-channel standard deviation the result is divided by.
        Pass ``mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)`` for a symmetric
        [-1, 1] normalisation instead of the default statistics.
    :return: a new normalised array.
    """
    # Every step allocates a new array so the caller's image stays untouched.
    image = image / 255.0
    image = image - np.asarray(mean)
    image = image / np.asarray(std)
    return image
class YOLOXEntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """
    INT8 entropy calibrator that feeds letterboxed ``*.jpg`` images to TensorRT.

    Images are resized with preserved aspect ratio, padded to
    ``(args.height, args.width)``, converted from BGR to RGB, scaled to
    [0, 1] and copied to a single pre-allocated device buffer, one batch
    at a time.
    """

    def __init__(self, args, files_path='data', cache_file='YOLOX.cache'):
        """
        :param args: namespace providing ``batch_size``, ``channel``,
            ``height`` and ``width``.
        :param files_path: directory searched (non-recursively) for ``*.jpg``.
        :param cache_file: path used to read/write the calibration cache.
        """
        # The TensorRT base class has to be initialised explicitly.
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.cache_file = cache_file
        self.batch_size = args.batch_size
        self.Channel = args.channel
        self.Height = args.height
        self.Width = args.width
        # Sorted so the calibration order does not depend on directory
        # listing order and is reproducible between runs.
        self.imgs = sorted(glob.glob(os.path.join(files_path, '*.jpg')))
        self.batch_idx = 0
        # Only full batches are used; trailing images are dropped.
        self.max_batch_idx = len(self.imgs) // self.batch_size
        # Size in bytes of one float32 batch; the device buffer is reused
        # for every batch.
        self.data_size = trt.volume(
            [self.batch_size, self.Channel, self.Height, self.Width]
        ) * trt.float32.itemsize
        self.device_input = cuda.mem_alloc(self.data_size)

    def __resize_pic(self, im0, auto=False):
        """
        Resize an image keeping its aspect ratio, then pad it with grey.

        :param im0: image array of shape (H, W, C).
        :param auto: if True, pad only up to the next multiple of 32;
            otherwise pad to the full (Height, Width) target.
        :return: tuple ``(padded image, (new_h, new_w), original shape,
            (top, bottom, left, right) padding in pixels)``.
        """
        h, w, _ = im0.shape
        # Scale by the larger ratio so the resized image fits in the target.
        r = max(h / self.Height, w / self.Width)
        new_h = int(round(h / r))
        new_w = int(round(w / r))
        if auto:
            ph = np.mod(self.Height - new_h, 32) / 2
            pw = np.mod(self.Width - new_w, 32) / 2
        else:
            ph = (self.Height - new_h) / 2
            pw = (self.Width - new_w) / 2
        # The -0.1/+0.1 offsets split an odd padding total as (k, k + 1)
        # so that both sides always add up to the exact amount.
        pt = int(round(ph - 0.1))
        pb = int(round(ph + 0.1))
        pl = int(round(pw - 0.1))
        pr = int(round(pw + 0.1))
        im = cv2.resize(im0, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        im = cv2.copyMakeBorder(im, pt, pb, pl, pr, cv2.BORDER_CONSTANT, value=(114, 114, 114))
        return im, (new_h, new_w), im0.shape, (pt, pb, pl, pr)

    def per_process(self, img_path):
        """
        Load one image and convert it to the network input layout.

        :param img_path: path of the image file.
        :return: float32 array of shape (C, H, W) with values in [0, 1].
        :raises ValueError: if the image cannot be read.
        """
        im = cv2.imread(img_path)
        if im is None:
            # cv2.imread reports failure by returning None rather than raising.
            raise ValueError('failed to read image: ' + img_path)
        im, _, _, _ = self.__resize_pic(im)
        im = im[..., ::-1]  # BGR -> RGB
        im = np.ascontiguousarray(im, dtype=np.float32)
        im = np.transpose(im, (2, 0, 1))  # HWC -> CHW
        im = im / 255
        return im

    def next_batch(self):
        """
        Read and preprocess the next full batch of images.

        :return: contiguous float32 array of shape (B, C, H, W), or an empty
            array once all full batches have been consumed.
        :raises ValueError: if a preprocessed image does not match the
            per-image size of the device buffer.
        """
        if self.batch_idx >= self.max_batch_idx:
            return np.array([])
        start = self.batch_idx * self.batch_size
        batch_files = self.imgs[start:start + self.batch_size]
        batch_imgs = np.zeros((self.batch_size, self.Channel, self.Height, self.Width), dtype=np.float32)
        expected_nbytes = self.data_size // self.batch_size
        for i, f in enumerate(batch_files):
            img = self.per_process(f)
            # Explicit check instead of assert so it also runs under -O.
            if img.nbytes != expected_nbytes:
                raise ValueError('not valid img!' + f)
            batch_imgs[i] = img
        self.batch_idx += 1
        print("batch:[{}/{}]".format(self.batch_idx, self.max_batch_idx))
        return np.ascontiguousarray(batch_imgs)

    def get_batch_size(self):
        """
        Return the calibration batch size.
        """
        return self.batch_size

    def get_batch(self, names, p_str=None):
        """
        Copy the next batch to device memory and hand its address to TensorRT.

        :param names: input tensor names supplied by TensorRT (unused).
        :param p_str: unused.
        :return: list with the device pointer, or None when no batch is
            available or preprocessing failed.
        """
        try:
            batch_imgs = self.next_batch()
            expected_size = self.batch_size * self.Channel * self.Height * self.Width
            if batch_imgs.size == 0 or batch_imgs.size != expected_size:
                return None
            # next_batch already returns contiguous float32 data, so no
            # extra astype copy is needed before the transfer.
            cuda.memcpy_htod(self.device_input, batch_imgs)
            return [int(self.device_input)]
        except Exception as e:
            # Best effort: report the problem and return None, the same value
            # used when no batch is left.
            print("发生异常,异常为:{}".format(e))
            return None

    def read_calibration_cache(self):
        """
        Return the bytes of the cache file if it exists, otherwise None.
        """
        if os.path.exists(self.cache_file):
            print("succeed finding cache file:{}".format(self.cache_file))
            with open(self.cache_file, "rb") as f:
                return f.read()
        print("failed finding cache!")
        return None

    def write_calibration_cache(self, cache):
        """
        Write the calibration cache produced by TensorRT to ``cache_file``.
        """
        with open(self.cache_file, "wb") as f:
            f.write(cache)
        print("succeed saving cache!")
import tensorrt as trt
import argparse
# Network-creation flag mask that requests an explicit batch dimension.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def ONNX2TRT(args, calib=None):
    """
    Convert an ONNX model to a serialized TensorRT engine in FP16 or INT8 mode.

    :param args: namespace providing ``mode`` ('fp16' or 'int8', case
        insensitive), ``onnx_file_path`` and ``engine_file_path``.
    :param calib: calibrator assigned to the builder config in 'int8' mode.
    :return: the serialized engine that was written to
        ``args.engine_file_path``, or None if parsing or building failed.
    :raises ValueError: if ``args.mode`` is not 'fp16' or 'int8'.
    :raises RuntimeError: if the platform lacks fast support for the mode.
    """
    mode = args.mode.lower()
    # Explicit checks instead of asserts so validation survives ``python -O``.
    if mode not in ('fp16', 'int8'):
        raise ValueError("mode should be in ['fp16', 'int8']")
    G_LOGGER = trt.Logger(trt.Logger.WARNING)
    with trt.Builder(G_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, G_LOGGER) as parser, \
            builder.create_builder_config() as config:
        # Cap the builder workspace at 4 GiB. The former value,
        # 4096 * (1 << 30), is 4 TiB, far beyond any device's memory.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)
        if mode == 'int8':
            if not builder.platform_has_fast_int8:
                raise RuntimeError("not support int8")
            config.set_flag(trt.BuilderFlag.INT8)
            config.int8_calibrator = calib
        else:
            if not builder.platform_has_fast_fp16:
                raise RuntimeError("not support fp16")
            config.set_flag(trt.BuilderFlag.FP16)
        # Load and parse the ONNX model.
        print('Loading ONNX file from path {}...'.format(args.onnx_file_path))
        with open(args.onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print("ERROR: Failed to parse the ONNX file.")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        print(network.get_input(0).shape)
        print('Completed parsing of ONNX file')
        print('Building an engine from file {}; this may take a while...'.format(args.onnx_file_path))
        engine = builder.build_serialized_network(network, config)
        if engine is None:
            # Bail out before opening the output so no empty engine file
            # is left behind.
            print("ERROR: Failed to build the engine.")
            return None
        with open(args.engine_file_path, "wb") as f:
            f.write(engine)
        print("Completed creating Engine")
        return engine
if __name__ == "__main__":
    # Command-line entry point: parse options, build the calibrator when
    # needed, then convert the ONNX model.
    parser = argparse.ArgumentParser(description="Pytorch2TensorRT args")
    parser.add_argument("--batch_size", type=int, default=1, help='batch_size')
    parser.add_argument("--channel", type=int, default=3, help='input channel')
    parser.add_argument("--height", type=int, default=640, help='input height')
    parser.add_argument("--width", type=int, default=640, help='input width')
    parser.add_argument("--cache_file", type=str, default='YOLOX.cache', help='cache_file')
    parser.add_argument("--calib_dir", type=str, default='data', help='directory containing *.jpg calibration images')
    parser.add_argument("--mode", type=str, default='int8', help='fp16 or int8')
    parser.add_argument("--onnx_file_path", type=str, default='yolov8s.onnx', help='onnx_file_path')
    parser.add_argument("--engine_file_path", type=str, default='yolov8s_int8.engine', help='engine_file_path')
    args = parser.parse_args()
    # Only INT8 needs calibration data; skipping the calibrator in FP16 mode
    # avoids a needless device allocation. --cache_file is now forwarded
    # instead of being parsed and ignored.
    calib = None
    if args.mode.lower() == 'int8':
        calib = YOLOXEntropyCalibrator(args, files_path=args.calib_dir, cache_file=args.cache_file)
    ONNX2TRT(args, calib)
标签:engine,python,onnx,self,cache,batch,im,np,size
From: https://blog.csdn.net/Meoyou/article/details/141052052