The RDD2022 dataset is a road damage detection dataset; compared with RDD2020 it adds more than 20,000 images.
However, its annotation format cannot be used by YOLO directly, and a large number of its images have no valid labels, so this article documents the data cleaning and conversion process.
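Concretely, RDD2022 ships Pascal-VOC-style XML annotations with pixel-coordinate bounding boxes, while YOLO expects one line per box of the form class_id x_center y_center width height, all normalized by the image size. As a purely illustrative example (the numbers are invented): a D00 box with xmin=100, ymin=150, xmax=300, ymax=350 in a 600×600 image becomes

0 0.3333 0.4167 0.3333 0.3333

because x_center = (100+300)/2/600 ≈ 0.3333, y_center = (150+350)/2/600 ≈ 0.4167, width = (300-100)/600 ≈ 0.3333 and height = (350-150)/600 ≈ 0.3333. This is exactly the conversion that the convert() function in the script below performs.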
Before starting, download the RDD2022 dataset yourself as a zip archive (12.4 GB). Click here to download.
Then create a folder named my_file on the desktop and put the downloaded archive into it, together with my main.py. The folder structure is as follows:
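Assuming the archive keeps its original name RDD2022.zip (the script looks for exactly that file name next to main.py), my_file should look roughly like this before running:

my_file/
    RDD2022.zip    <- the 12.4 GB archive downloaded above
    main.py        <- the script listed below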
Then simply run the main file in PyCharm.
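If you prefer a terminal over PyCharm, running the script from inside my_file should work just as well, for example:

cd Desktop/my_file
python main.py

(The exact path to the desktop depends on your system; the script only assumes that RDD2022.zip sits in its working directory.)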
After the run finishes, my_file has the structure shown below; the my_data folder is the one you actually need, everything else can be deleted:
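Based on what the script creates, the layout after a successful run should look roughly like this (xml_list.txt is the assumed name of the intermediate file, see generate_txt_file below):

my_file/
    RDD2022.zip
    main.py
    xml_list.txt                <- list of XML paths used by the label conversion
    RDD2022_all_countries/      <- extracted data plus the per-country new_* folders
    new_train/                  <- all labeled images, their XMLs and converted label files
    my_data/
        images/
            train/
            val/
        labels/
            train/
            val/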
Note:
The code of main.py is as follows:
import zipfile
import os
import xml.etree.ElementTree as ET
from shutil import copyfile
import shutil
import argparse
from pathlib import Path
import random
from collections import defaultdict

work_dir = os.getcwd()
countries = ["China_Drone", "China_MotorBike", "Czech", "India", "Japan", "Norway", "United_States"]
labels = ["D00", "D10", "D20", "D40"]


# Extract the initial 12.4 GB archive into the working directory.
# After extraction there is a folder named RDD2022_all_countries.
def unzip_rdd2022():
    path = os.path.join(work_dir, 'RDD2022.zip')
    zip_file = zipfile.ZipFile(path)
    zip_list = zip_file.namelist()
    for f in zip_list:
        zip_file.extract(f, work_dir)
    zip_file.close()


# The RDD2022_all_countries folder holds six zip archives named after the countries.
# Go into that folder and keep extracting; note that everything is extracted into
# RDD2022_all_countries itself. After this step every archive has been unpacked.
def unzip_RDD2022_all_countries():
    dir_path = os.path.join(work_dir, 'RDD2022_all_countries')
    all_countries_zip_file_name = os.listdir(dir_path)
    for name in all_countries_zip_file_name:
        print('Extracting {}'.format(name))
        all_countries_zip_file_path = os.path.join(dir_path, name)
        zip_file = zipfile.ZipFile(all_countries_zip_file_path)
        zip_list = zip_file.namelist()
        for f in zip_list:
            zip_file.extract(f, dir_path)
        zip_file.close()
        print('{} extracted'.format(name))


# Move every image that has at least one valid annotation, together with that
# annotation, into a new folder. All later steps only work on these images,
# which effectively removes the images without labels.
def remove_useless_file():
    # Process the data country by country (the countries list above has 7 entries
    # because China has two subsets).
    RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("Processing labels and images of {}".format(country))
        annoFiles = os.listdir(os.path.join(RDD2022_all_countries_path, country + "/train/annotations/xmls/"))
        newCountry = "new_" + country
        # Under RDD2022_all_countries create new_<country>/Annotations and
        # new_<country>/JPEGImages.
        annotations_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'Annotations/')
        jpegimages_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'JPEGImages/')
        os.makedirs(annotations_dir, exist_ok=True)
        os.makedirs(jpegimages_dir, exist_ok=True)
        for annoFile in annoFiles:
            tree = ET.parse(os.path.join(RDD2022_all_countries_path + "/" + country + "/train/annotations/xmls/", annoFile))
            root = tree.getroot()
            # Drop every object whose class is not one of D00/D10/D20/D40.
            for obj in root.findall("object"):
                if obj.find("name").text not in labels:
                    root.remove(obj)
            if len(root.findall("object")) > 0:
                country_path = os.path.join(RDD2022_all_countries_path, country)
                newCountry_path = os.path.join(RDD2022_all_countries_path, newCountry)
                tree.write(newCountry_path + "/Annotations/" + annoFile)
                copyfile(os.path.join(country_path + "/train/images/", annoFile.split(".")[0]) + ".jpg",
                         newCountry_path + "/JPEGImages/" + annoFile.split(".")[0] + ".jpg")
            else:
                # No valid objects left in this annotation file, skip it.
                continue
        print("Finished labels and images of {}".format(country))


# Copy all images into new_train/JPEGImages under the working directory and
# all annotations into new_train/Annotations.
def copy_file_2_new_train_dir():
    # Create the new_train folder first.
    os.makedirs(os.path.join(work_dir, 'new_train'), exist_ok=True)
    # Then its two sub-folders.
    jpeg_path = os.path.join(work_dir, 'new_train', 'JPEGImages/')
    annotation_path = os.path.join(work_dir, 'new_train', 'Annotations/')
    os.makedirs(jpeg_path, exist_ok=True)
    os.makedirs(annotation_path, exist_ok=True)
    RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("Copying {}".format(country))
        jpeg_dir_path = os.path.join(RDD2022_all_countries_path, 'new_' + country, 'JPEGImages')
        all_jpeg_names = os.listdir(jpeg_dir_path)
        annotation_dir_path = os.path.join(RDD2022_all_countries_path, 'new_' + country, 'Annotations')
        all_anno_names = os.listdir(annotation_dir_path)
        for name in all_jpeg_names:
            shutil.copy(os.path.join(jpeg_dir_path, name), jpeg_path)
        for name in all_anno_names:
            shutil.copy(os.path.join(annotation_dir_path, name), annotation_path)
        print("Finished copying {}".format(country))


# Write the absolute path of every XML file into a txt file so that
# xml2yolo() can read it.
def generate_txt_file():
    # NOTE: the output file name was not preserved in the original post;
    # "xml_list.txt" is an assumed name and only has to match the default
    # used by xml2yolo() below.
    annoFiles = os.listdir(os.path.join(work_dir, "new_train/Annotations/"))
    yoloFile = open("./xml_list.txt", "w")
    for i in range(len(annoFiles)):
        yoloFile.writelines(work_dir + "/new_train/Annotations/" + annoFiles[i] + "\n")
    yoloFile.close()


def xml2yolo():
    from PIL import Image

    # Image extensions that may occur in the dataset.
    imageType = ["jpeg", "png", "jpg", "JPEG", "JPG", "PNG"]
    # Dictionary that stores the set of image paths seen for each class.
    imageListDict = defaultdict(set)

    def convert(size, box):
        # box is (minX, maxX, minY, maxY); return normalized x, y, w, h.
        dw = 1. / size[0]
        dh = 1. / size[1]
        x = (box[0] + box[1]) / 2.0
        y = (box[2] + box[3]) / 2.0
        w = box[1] - box[0]
        h = box[3] - box[2]
        x = x * dw
        w = w * dw
        y = y * dh
        h = h * dh
        return [x, y, w, h]

    # Convert minX,minY,maxX,maxY to the normalized numbers required by YOLO.
    def getYoloNumbers(imagePath, minX, minY, maxX, maxY):
        image = Image.open(imagePath)
        w = int(image.size[0])
        h = int(image.size[1])
        bb = convert((w, h), (minX, maxX, minY, maxY))
        image.close()
        return bb

    def getFileList3(filePath):
        # Read the txt file produced by generate_txt_file(), turn every entry
        # into the path of an XML annotation, and make sure the matching
        # "labels" output folder exists.
        with open(filePath, "r") as f:
            xmlFiles = f.readlines()
        for i in range(len(xmlFiles)):
            temp = xmlFiles[i].strip().rsplit('.', 1)[0]
            xmlFiles[i] = temp.replace("JPEGImages", "Annotations") + ".xml"
            labels_path = os.path.dirname(xmlFiles[i]).replace("Annotations", "labels")
            if not os.path.exists(labels_path):
                os.mkdir(labels_path)
            assert os.path.exists(xmlFiles[i])
        return xmlFiles

    def main():
        parser = argparse.ArgumentParser(description='run phase2.')
        parser.add_argument('--input-file', type=str,
                            help='list of xml file paths (absolute); defaults to the file '
                                 'written by generate_txt_file() ("./xml_list.txt", assumed name)',
                            default='./xml_list.txt')
        args = parser.parse_args()

        # Assign each class of the dataset to a number.
        outputCtoId = {'D00': 0, 'D10': 1, 'D20': 2, 'D40': 3}

        # Read the paths of the XML annotation files.
        xmlFiles = getFileList3(args.input_file)
        print("total files:", len(xmlFiles))
        print('Converting......')

        # Loop over each annotation file.
        for filePath in xmlFiles:
            tree = ET.parse(filePath)
            root = tree.getroot()

            # Find the matching image by trying the known extensions.
            i = 0
            imageFile = filePath[:-4].replace("Annotations", "JPEGImages") + "." + imageType[i]
            while not os.path.isfile(imageFile) and i < len(imageType) - 1:
                i += 1
                imageFile = filePath[:-4].replace("Annotations", "JPEGImages") + "." + imageType[i]
            if not os.path.isfile(imageFile):
                print("File not found:", imageFile)
                continue

            txtFile = filePath[:-4].replace("Annotations", "labels") + ".txt"
            yoloOutput = open(txtFile, "w")

            # Loop over each object tag inside the annotation.
            for objects in root.findall('object'):
                surfaceType = objects.find('name').text.replace(" ", "")
                if surfaceType == "D30":
                    continue
                bndbox = objects.find('bndbox')
                # Children are xmin, ymin, xmax, ymax; some values are stored as floats.
                [minX, minY, maxX, maxY] = [int(float(child.text)) for child in bndbox]
                [x, y, w, h] = getYoloNumbers(imageFile, minX, minY, maxX, maxY)

                yoloOutput.write(str(outputCtoId[surfaceType]) + " " + str(x) + " " +
                                 str(y) + " " + str(w) + " " + str(h) + "\n")
                imageListDict[outputCtoId[surfaceType]].add(imageFile)
            yoloOutput.close()

        # Print how many images contain each class.
        for cl in imageListDict:
            print(labels[cl], ":", len(imageListDict[cl]))

    main()


def generate_my_data():
    # Create my_data in the working directory together with its images and
    # labels sub-folders.
    os.makedirs(os.path.join(work_dir, 'my_data'), exist_ok=True)
    images_path = os.path.join(work_dir, 'my_data', 'images/')
    labels_path = os.path.join(work_dir, 'my_data', 'labels/')
    os.makedirs(images_path, exist_ok=True)
    os.makedirs(labels_path, exist_ok=True)
    # images and labels each get a train and a val sub-folder.
    os.makedirs(os.path.join(images_path, 'train/'), exist_ok=True)
    os.makedirs(os.path.join(images_path, 'val/'), exist_ok=True)
    os.makedirs(os.path.join(labels_path, 'train/'), exist_ok=True)
    os.makedirs(os.path.join(labels_path, 'val/'), exist_ok=True)
    print("Basic structure of the final my_data folder created")

    # Group the image names in new_train by country.
    new_train_path = os.path.join(work_dir, 'new_train')
    jpeg_dir_path = os.path.join(new_train_path, 'JPEGImages')
    labels_dir_path = os.path.join(new_train_path, 'labels')
    all_images_name = os.listdir(jpeg_dir_path)
    all_countries_images = defaultdict(list)
    for name in all_images_name:
        country_name = '_'.join(name.split('_')[:-1])
        all_countries_images[country_name].append(name)
    images_len = sum([len(i) for i in all_countries_images.values()])
    print("{} images in total".format(images_len))
    for k, v in all_countries_images.items():
        print("{} has {} images".format(k, len(v)))
    print('*************************')
    print("Splitting the dataset")
    for country in countries:
        image_len = len(all_countries_images[country])
        train_nums = int(image_len * 0.7)
        val_nums = image_len - train_nums
        print("{}: {} images in total, 7/10 ({}) go to train, 3/10 ({}) go to val, splitting".format(
            country, image_len, train_nums, val_nums))
        # Randomly pick val_nums indices out of 0 .. image_len-1 for the validation set.
        val_index = set(random.sample(range(0, image_len), val_nums))
        for idx, name in enumerate(all_countries_images[country]):
            # Copy the image either to train or to val.
            source = os.path.join(jpeg_dir_path, name)
            target = os.path.join(images_path, 'train') if idx not in val_index else os.path.join(images_path, 'val')
            shutil.copy(source, target)
            # Copy the matching label file; its name is the image name with a .txt suffix.
            label_name = name.split('.')[0] + '.txt'
            label_source = os.path.join(labels_dir_path, label_name)
            label_target = os.path.join(labels_path, 'train') if idx not in val_index else os.path.join(labels_path, 'val')
            shutil.copy(label_source, label_target)
    all_train_len = len(os.listdir(os.path.join(work_dir, 'my_data', 'images', 'train')))
    all_val_len = len(os.listdir(os.path.join(work_dir, 'my_data', 'images', 'val')))
    print("Dataset split finished: {} training images, {} validation images".format(all_train_len, all_val_len))
    print("\n\n\n\n*************************")
    print("Done. The target folder is my_data; every other file can be deleted")
    print("Note: there are 4 damage classes, named and numbered as follows")
    for idx, i in enumerate(labels):
        print("{}: {}".format(i, idx))


if __name__ == '__main__':
    print("Extracting the outermost 12.4 GB archive")
    unzip_rdd2022()
    print("Extracting the per-country archives")
    unzip_RDD2022_all_countries()
    print("Cleaning: removing images without valid labels")
    remove_useless_file()
    print("Copying all images and labels into one directory")
    copy_file_2_new_train_dir()
    print("Generating the txt file used by the label conversion")
    generate_txt_file()
    print("Converting labels")
    xml2yolo()
    print("Generating the final folder")
    generate_my_data()
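As an optional follow-up (this is not part of the original main.py), a small sanity check along the lines of the sketch below can confirm that every image in my_data has a matching label file and show how many boxes each class ended up with; the class order follows the labels list defined above.

# Optional check: run it from inside my_file after main.py has finished.
import os
from collections import Counter

my_data = os.path.join(os.getcwd(), 'my_data')   # adjust if my_data was moved elsewhere
class_names = ["D00", "D10", "D20", "D40"]

for split in ('train', 'val'):
    img_dir = os.path.join(my_data, 'images', split)
    lbl_dir = os.path.join(my_data, 'labels', split)
    missing = 0
    box_counts = Counter()
    for img_name in os.listdir(img_dir):
        lbl_path = os.path.join(lbl_dir, os.path.splitext(img_name)[0] + '.txt')
        if not os.path.isfile(lbl_path):
            missing += 1
            continue
        with open(lbl_path) as f:
            for line in f:
                if line.strip():
                    box_counts[class_names[int(line.split()[0])]] += 1
    print(split, '- images without a label file:', missing, '- boxes per class:', dict(box_counts))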