首页 > 其他分享 >coco2017 Dataset EDA

coco2017 Dataset EDA

时间:2022-09-20 09:56:34 浏览次数:102
标签:plot EDA self Dataset num cat2objs coco coco2017 distribution

Github仓库:gy-7/coco_EDA (github.com)

对coco数据集的分析,近期忙着写论文,空余时间很少能写博文了。

EDA的代码放在结尾了,Github仓库里也有。仓库里还有其他的一些EDA分析,不定时更新。

训练集所有类别的数量分布情况:

coco_train_class_distribution

训练集所有类别的尺寸分布情况:

coco_train_size_distribution

验证集所有类别的数量分布情况:

coco_val_class_distribution

验证集所有类别的尺寸分布情况:

coco_val_size_distribution

EDA代码:

import os
import seaborn as sns
import pycocotools.coco
import matplotlib.pyplot as plt

# Both annotation files are looked up in the "annotations" folder of the
# directory the script is started from.
root_dir = os.getcwd()
train_ann_fp, val_ann_fp = (
    os.path.join(root_dir, 'annotations', ann_name)
    for ann_name in ('instances_train2017.json', 'instances_val2017.json')
)


class COCO_EDA:
    """Holds one COCO annotation file plus the statistics derived from it.

    The constructor only loads the file and creates empty accumulators;
    the accumulators are filled afterwards by ``collect_data``.
    """

    def __init__(self, json_file, type='train'):
        """Load ``json_file`` and initialise every accumulator.

        Args:
            json_file: path handed to ``pycocotools.coco.COCO``.
            type: split label; it is stored as ``self.type`` and used by the
                plotting helpers when they build output file names.
        """
        # Side lengths whose squares are the area limits between
        # small / medium / large objects.
        self.COCO_SMALL_SCALE, self.COCO_MEDIUM_SCALE = 32, 96

        self.json_file = json_file
        api = pycocotools.coco.COCO(json_file)

        self.type = type
        # Raw tables of the annotation file.
        self.imgs, self.anns, self.cats = (
            api.dataset[key] for key in ('images', 'annotations', 'categories')
        )
        # Id lists as reported by the pycocotools accessors.
        self.img_ids = api.getImgIds()
        self.ann_ids = api.getAnnIds()
        self.cat_ids = api.getCatIds()

        # Index tables built by pycocotools.
        self.cat2imgs, self.img2anns = api.catToImgs, api.imgToAnns

        self.imgs_num, self.objs_num = len(self.imgs), len(self.anns)

        # data to be collected
        # NOTE: the "objss" spelling is kept because the rest of the script
        # reads and writes these attributes under exactly these names.
        self.small_objs_num = self.medium_objss_num = self.large_objss_num = 0

        # Annotations of each size bucket.
        self.small_objs, self.medium_objs, self.large_objs = [], [], []

        # category id -> annotations (all sizes, then small / medium / large)
        self.cat2objs = {}
        self.small_cat2objs, self.medium_cat2objs, self.large_cat2objs = {}, {}, {}
        # category id -> object count (all sizes, then small / medium / large)
        self.cat2objs_num = {}
        self.small_cat2objs_num, self.medium_cat2objs_num, self.large_cat2objs_num = {}, {}, {}

        # plot use data
        self.catid2name = {}  # category id -> name shown in the plots
        self.cats_plot = []  # one class name per object, all sizes
        # one class name per small / medium / large object
        self.small_cats_plot, self.medium_cats_plot, self.large_cats_plot = [], [], []

        # category id -> list of size tags, one per object of that category
        self.size_distribution = {}


def collect_data(coco):
    """Fill the accumulators of ``coco`` from its annotation list.

    Every entry of ``coco.anns`` is put into a size bucket by its ``'area'``:
    below ``COCO_SMALL_SCALE ** 2`` it is small (tag ``'s'``), below
    ``COCO_MEDIUM_SCALE ** 2`` it is medium (``'m'``), otherwise large
    (``'l'``). The annotation is recorded in the global bucket lists, in the
    per-category tables, in the per-object class-name lists used for plotting
    and in ``size_distribution``.

    The function works in place and returns nothing. The global lists are
    emptied and the counters recomputed on every call, so running it twice on
    the same object yields the same statistics instead of doubled ones.

    Args:
        coco: object exposing the attributes created by ``COCO_EDA.__init__``.

    Raises:
        KeyError: if an annotation's ``'category_id'`` is not among
            ``coco.cat_ids`` / ``coco.cats``.
        AssertionError: if the collected totals are inconsistent with each
            other or with ``coco.objs_num``.
    """
    # The thresholds are loop invariants; square them once.
    small_limit = coco.COCO_SMALL_SCALE ** 2
    medium_limit = coco.COCO_MEDIUM_SCALE ** 2

    # Drop anything left from a previous run. The per-category tables are
    # re-created below, so without this a second call would double the
    # global lists only and trip the consistency checks at the end.
    for stale in (coco.small_objs, coco.medium_objs, coco.large_objs,
                  coco.cats_plot, coco.small_cats_plot,
                  coco.medium_cats_plot, coco.large_cats_plot):
        stale.clear()

    for cat_id in coco.cat_ids:
        coco.cat2objs[cat_id] = []
        coco.small_cat2objs[cat_id] = []
        coco.medium_cat2objs[cat_id] = []
        coco.large_cat2objs[cat_id] = []
        coco.cat2objs_num[cat_id] = 0
        coco.small_cat2objs_num[cat_id] = 0
        coco.medium_cat2objs_num[cat_id] = 0
        coco.large_cat2objs_num[cat_id] = 0
        coco.size_distribution[cat_id] = []

    for cat in coco.cats:
        coco.catid2name[cat['id']] = cat['name']

    # Single pass: global buckets and per-category distribution together.
    for ann in coco.anns:
        cat_id = ann['category_id']
        coco.cat2objs[cat_id].append(ann)
        coco.cat2objs_num[cat_id] += 1
        cat_name = coco.catid2name[cat_id]
        coco.cats_plot.append(cat_name)

        area = ann['area']
        if area < small_limit:
            coco.small_objs.append(ann)
            coco.small_cat2objs[cat_id].append(ann)
            coco.small_cat2objs_num[cat_id] += 1
            coco.small_cats_plot.append(cat_name)
            coco.size_distribution[cat_id].append('s')
        elif area < medium_limit:
            coco.medium_objs.append(ann)
            coco.medium_cat2objs[cat_id].append(ann)
            coco.medium_cat2objs_num[cat_id] += 1
            coco.medium_cats_plot.append(cat_name)
            coco.size_distribution[cat_id].append('m')
        else:
            coco.large_objs.append(ann)
            coco.large_cat2objs[cat_id].append(ann)
            coco.large_cat2objs_num[cat_id] += 1
            coco.large_cats_plot.append(cat_name)
            coco.size_distribution[cat_id].append('l')

    # Attribute names (including the "objss" spelling) are what the rest of
    # the script reads.
    coco.small_objs_num = len(coco.small_objs)
    coco.medium_objss_num = len(coco.medium_objs)
    coco.large_objss_num = len(coco.large_objs)

    # Sanity checks: the global buckets and the per-category counters must agree.
    assert coco.small_objs_num == sum(coco.small_cat2objs_num.values())
    assert coco.medium_objss_num == sum(coco.medium_cat2objs_num.values())
    assert coco.large_objss_num == sum(coco.large_cat2objs_num.values())
    assert len(coco.anns) == coco.objs_num == sum(coco.cat2objs_num.values())


def plot_coco_class_distribution(plot_data, plot_order, save_fp, plot_title, plot_y_heigh,
                                 plot_y_heigh_residual=(1800, 100)):
    """Draw, save and show a bar chart of object counts per class.

    Args:
        plot_data: one class name per object; passed to ``sns.countplot`` as
            ``x`` so that each bar is the number of occurrences of a name.
        plot_order: class names in the order the bars are drawn.
        save_fp: path of the image file to write; missing parent directories
            are created.
        plot_title: figure title.
        plot_y_heigh: one count per bar, in ``plot_order`` order; used to
            position and print the number above each bar.
        plot_y_heigh_residual: two vertical offsets for the count labels.
            Index 0 is used when the file name contains ``'train'``, index 1
            otherwise.
    """
    # Decide on the file name only: a directory that merely contains "train"
    # (e.g. ".../training/EDA/coco_val_...") must not select the train offset.
    is_train = 'train' in os.path.basename(save_fp)
    label_offset = plot_y_heigh_residual[0 if is_train else 1]

    # savefig does not create directories.
    out_dir = os.path.dirname(save_fp)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(15, 8))  # width and height in inches
    plt.title(plot_title, fontsize=9)
    plt.xlabel('class', fontsize=8)
    plt.ylabel('counts', fontsize=8)
    plt.xticks(rotation=90, fontsize=8)  # vertical class names on the x axis
    plt.yticks(fontsize=8)
    for x, y in enumerate(plot_y_heigh):
        plt.text(x, y + label_offset, '%s' % y, ha='center', fontsize=7, rotation=90)
    # "PuBu_r" palette: blue gradient.
    # palette styles: https://blog.csdn.net/panlb1990/article/details/103851983
    sns.countplot(x=plot_data, palette="PuBu_r", order=plot_order)
    plt.savefig(save_fp, dpi=500)
    plt.show()
    # Release the figure; otherwise every call keeps one more figure alive.
    plt.close(fig)


def plot_size_distribution(plot_data, save_fp, plot_title, plot_order=('s', 'm', 'l')):
    """Draw one small/medium/large count plot per category into a single image.

    Args:
        plot_data: mapping whose values are sequences of size tags (e.g.
            ``'s'``, ``'m'``, ``'l'``), one sequence per subplot, taken in
            the mapping's iteration order.
        save_fp: path of the image file to write; missing parent directories
            are created.
        plot_title: sequence of subplot titles, indexed in the same order as
            the values of ``plot_data``.
        plot_order: size tags in the order the bars are drawn.
    """
    # savefig does not create directories.
    out_dir = os.path.dirname(save_fp)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Keep the 10 x 8 layout for up to 80 subplots and add rows beyond that,
    # so larger category sets do not overflow the grid.
    n_cols = 8
    n_rows = max(10, -(-len(plot_data) // n_cols))
    bar_order = list(plot_order)

    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(21, 35))  # width and height in inches
    # spacing between the subplots
    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=1, hspace=1.5)

    for idx, size_data in enumerate(plot_data.values()):
        plt.subplot(n_rows, n_cols, idx + 1)
        plt.xticks(rotation=0, fontsize=18)  # size tags stay horizontal
        plt.yticks(fontsize=18)
        plt.xlabel('size', fontsize=20)
        plt.ylabel('count', fontsize=20)
        plt.title(plot_title[idx], fontsize=24)
        sns.countplot(x=size_data, palette="PuBu_r", order=bar_order)  # blue gradient palette

    plt.savefig(save_fp, dpi=500, pad_inches=0)
    plt.show()
    # Release the (large) figure once it has been written and shown.
    plt.close(fig)


def run_plot_coco_class_distribution(coco, save_dir):
    """Plot the class distribution of all, small, medium and large objects.

    Four images are written to ``save_dir``; file names and titles carry
    ``coco.type`` so that train and val plots are labelled by their own split.

    Args:
        coco: object whose accumulators were filled by ``collect_data``.
        save_dir: directory the images are written to.
    """
    # Walk the category ids once and derive both the bar order and the count
    # labels from them, so a label can never end up above another class's bar.
    cat_ids = list(coco.catid2name)
    plot_order = [coco.catid2name[cat_id] for cat_id in cat_ids]

    # (size label, one class name per object, category id -> count, label offsets)
    variants = (
        ('', coco.cats_plot, coco.cat2objs_num, [1800, 100]),
        ('small', coco.small_cats_plot, coco.small_cat2objs_num, [900, 50]),
        ('medium', coco.medium_cats_plot, coco.medium_cat2objs_num, [900, 50]),
        ('large', coco.large_cats_plot, coco.large_cat2objs_num, [900, 50]),
    )
    for size, names, counts, residual in variants:
        file_tag = f'{size}_' if size else ''
        title_tag = f'{size} ' if size else ''
        plot_heigh = [counts.get(cat_id, 0) for cat_id in cat_ids]
        save_fp = os.path.join(save_dir, f'coco_{coco.type}_{file_tag}class_distribution.png')
        plot_coco_class_distribution(names, plot_order, save_fp,
                                     f'COCO {coco.type}2017 {title_tag}class distribution',
                                     plot_heigh, plot_y_heigh_residual=residual)


def run_plot_coco_size_distribution(coco, save_dir):
    """Write the per-category size distribution image for ``coco``.

    Args:
        coco: object providing ``catid2name``, ``size_distribution`` and ``type``.
        save_dir: directory the image is written to.
    """
    # The category names serve as the subplot titles.
    subplot_titles = list(coco.catid2name.values())
    image_name = f'coco_{coco.type}_size_distribution.png'
    plot_size_distribution(coco.size_distribution, os.path.join(save_dir, image_name), subplot_titles)


if __name__ == '__main__':
    # Output folder "EDA" in the current directory, built with os.path so the
    # script is not tied to the Windows path separator, and created up front
    # because the plots are saved into it.
    save_dir = os.path.join(os.curdir, 'EDA')
    os.makedirs(save_dir, exist_ok=True)

    # Same report for both splits; a blank line separates the two sections.
    for idx, (split, ann_fp) in enumerate((('train', train_ann_fp), ('val', val_ann_fp))):
        if idx:
            print()

        print(f"analyze coco {split} dataset...")
        print("-" * 50)
        dataset = COCO_EDA(ann_fp, type=split)
        collect_data(dataset)
        print(f"coco {split} images num:", dataset.imgs_num)
        print(f"coco {split} objects num:", dataset.objs_num)
        print("coco small objects num:", dataset.small_objs_num)
        print("coco medium objects num:", dataset.medium_objss_num)
        print("coco large objects num:", dataset.large_objss_num)
        print("coco small objects percent:", dataset.small_objs_num / dataset.objs_num)
        print("coco medium objects percent:", dataset.medium_objss_num / dataset.objs_num)
        print("coco large objects percent:", dataset.large_objss_num / dataset.objs_num)
        run_plot_coco_class_distribution(dataset, save_dir)
        run_plot_coco_size_distribution(dataset, save_dir)
        print("-" * 50)

标签:plot,EDA,self,Dataset,num,cat2objs,coco,coco2017,distribution
From: https://www.cnblogs.com/gy77/p/16709989.html

相关文章

  • E10——Sharedata 使用
         通过ActiveObject.RL_[关联实体名].字段名来获取访问界面上的实体关联的字段  如下例子,大概就是通过已经设好的ShareData关系,通过关系名称取到对应......
  • 记录 javascript canvas ImageData 解析
    数组的内容:data.length:w*h*4r:0-255g:0-255b:0-255a:0-255Math.round(255*a) 数组遍历:constdata=context.getI......
  • EDA 自动化库:SpeedML
    EDA自动化库:SpeedML必须知道的库之一,才能拥有深刻的EDA!在数据科学领域,我们知道探索性数据分析或EDA是最重要和最耗时的部分,并且要拥有可用于模型的数据,我们必须花费......
  • Can't find resource for bundle java.util.PropertyResourceBundle, key product.ord
    读取的key与properties文件中的key不一致,大概率是写错了如果这个key用了,他就点亮,输入错误,或者没用到就是灰色......
  • 使用 pd.cut 将数值数据转换为 EDA 的分类数据
    使用pd.cut将数值数据转换为EDA的分类数据Photoby由Illyes设计on不飞溅在某些情况下,我们需要在探索性数据分析(EDA)的预处理中将数值数据转换为分类数据......
  • Dataset与DataFrame创建的比较方式
    DataFrameDataset创建方式1.根据集合或者RDD的隐式函数toDF(列名)创建(需要引入SparkSession的隐式转换函数)2.SparkSession的createDataFrame函数3.外部结构化......
  • 通过createDataset创建Dataset数据集(Array,RDD,List)
    说明定义:底层用到了函数的柯里化,需要传递两个值。第二个值是一个隐式参数,需要定义一个隐式变量给隐式参数传递值。隐式变量不需要我们定义在SparkSession中全部给......
  • 通过toDS()方法创建Dataset
    objectCreateDatasetByToDs{defmain(args:Array[String]):Unit={valconf=newSparkConf().setAppName("demo01").setMaster("local[*]")valsessio......
  • echarts-dataset数据源配置项
    如下效果图: 代码如下:let box4 = document.querySelector('.box4') let myCharts3 = echarts.init(box4) myCharts3.setOption({......
  • Pytorch——Dataset&Dataloader
    在利用Pytorch进行深度学习的训练时需要将数据进行打包,这就是Dataset与Dataloader的作用。 Dataset将数据进行包装,Dataloader迭代包装好的数据并输出每次训练所......