YOLOv7's most important innovations are its network design and its positive/negative sample assignment strategy. The annotated code below walks through the sample assignment to help understand it: `ComputeLossOTA.__call__` computes the total loss, `build_targets` performs the SimOTA-style dynamic matching, and `find_3_positive` generates the initial candidate positives (the same cross-grid assignment used in YOLOv5).
class ComputeLossOTA:
    # Compute losses
    def __init__(self, model, autobalance=False):
        super(ComputeLossOTA, self).__init__()
        device = next(model.parameters()).device  # get model device
        h = model.hyp  # hyperparameters

        # Define criteria
        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))

        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets

        # Focal loss
        g = h['fl_gamma']  # focal loss gamma
        if g > 0:
            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)

        det = model.module.model[-1] if is_parallel(model) else model.model[-1]  # Detect() module
        self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02])  # P3-P7
        self.ssi = list(det.stride).index(16) if autobalance else 0  # stride 16 index
        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, model.gr, h, autobalance
        for k in 'na', 'nc', 'nl', 'anchors', 'stride':
            setattr(self, k, getattr(det, k))

    def __call__(self, p, targets, imgs):  # predictions, targets, model
        device = targets.device
        lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)  # initialize the three losses to 0
        # build_targets returns, per layer, the matched image index, anchor index, gj, gi, GT boxes and anchors
        bs, as_, gjs, gis, targets, anchors = self.build_targets(p, targets, imgs)
        pre_gen_gains = [torch.tensor(pp.shape, device=device)[[3, 2, 3, 2]] for pp in p]  # e.g. [[80,80,80,80], [40,40,40,40], [20,20,20,20]]

        # Losses
        for i, pi in enumerate(p):  # layer index, layer predictions
            b, a, gj, gi = bs[i], as_[i], gjs[i], gis[i]  # image, anchor, gridy, gridx
            tobj = torch.zeros_like(pi[..., 0], device=device)  # target obj; pi[..., 0].shape = (bs, 3, 80, 80), so tobj is an all-zero tensor of that shape

            n = b.shape[0]  # number of targets
            if n:
                ps = pi[b, a, gj, gi]  # prediction subset corresponding to targets, shape (mP, nc+5); mP = number of positives matched to a GT box

                # Regression
                grid = torch.stack([gi, gj], dim=1)
                pxy = ps[:, :2].sigmoid() * 2. - 0.5
                #pxy = ps[:, :2].sigmoid() * 3. - 1.
                pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i]
                pbox = torch.cat((pxy, pwh), 1)  # predicted box
                selected_tbox = targets[i][:, 2:6] * pre_gen_gains[i]  # targets[i].shape = (mP, 6); scale the normalized GT boxes to this feature map
                selected_tbox[:, :2] -= grid
                iou = bbox_iou(pbox.T, selected_tbox, x1y1x2y2=False, CIoU=True)  # iou(prediction, target)
                lbox += (1.0 - iou).mean()  # iou loss

                # Objectness
                tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * iou.detach().clamp(0).type(tobj.dtype)  # iou ratio; tobj[b, a, gj, gi].shape = (mP,)

                # Classification
                selected_tcls = targets[i][:, 1].long()  # class ids of the matched GTs, shape (mP,)
                if self.nc > 1:  # cls loss (only if multiple classes)
                    t = torch.full_like(ps[:, 5:], self.cn, device=device)  # targets, shape (mP, nc), filled with the negative value self.cn (0 without label smoothing)
                    t[range(n), selected_tcls] = self.cp  # self.cp = 1 without label smoothing; t holds the smooth_BCE targets
                    lcls += self.BCEcls(ps[:, 5:], t)  # BCE

                # Append targets to text file
                # with open('targets.txt', 'a') as file:
                #     [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)]

            obji = self.BCEobj(pi[..., 4], tobj)
            lobj += obji * self.balance[i]  # obj loss
            if self.autobalance:
                self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()

        if self.autobalance:
            self.balance = [x / self.balance[self.ssi] for x in self.balance]
        lbox *= self.hyp['box']
        lobj *= self.hyp['obj']
        lcls *= self.hyp['cls']
        bs = tobj.shape[0]  # batch size

        loss = lbox + lobj + lcls
        return loss * bs, torch.cat((lbox, lobj, lcls, loss)).detach()
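To see why the decoding in `__call__` is numerically safe, here is a small standalone sanity check (my own sketch, not part of the yolov7 source; the input values are made up) that evaluates the two formulas at extreme raw outputs: the center offset is confined to (-0.5, 1.5) and the wh multiplier to (0, 4), which is exactly the range the anchor_t = 4 filter in find_3_positive assumes.

import torch

# Standalone sanity check of yolov7's box decoding ranges (not from the repo).
raw = torch.tensor([-10.0, 0.0, 10.0])      # extreme raw network outputs
xy_offset = raw.sigmoid() * 2. - 0.5        # center offset relative to the grid cell
wh_scale = (raw.sigmoid() * 2.) ** 2        # multiplier applied to the anchor wh

print(xy_offset)  # ~[-0.4999, 0.5000, 1.4999]: bounded to (-0.5, 1.5)
print(wh_scale)   # ~[8.2e-09, 1.0000, 3.9996]: bounded to (0, 4), unlike exp(), which can blow up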
`__call__` consumes the output of `build_targets`, which runs the SimOTA-style dynamic matching on top of the candidates produced by `find_3_positive`:

    def build_targets(self, p, targets, imgs):
        # p: predictions from the 3 detection layers, sizes [(bs,3,80,80,85), (bs,3,40,40,85), (bs,3,20,20,85)]
        # targets: (nt, 6) = (number of GT boxes, image_index + cls_id + x + y + w + h)
        #indices, anch = self.find_positive(p, targets)
        indices, anch = self.find_3_positive(p, targets)  # same role as compute_loss()'s build_targets: coarse candidate positives; indices[i] = (image, anchor index, grid indices)
        #indices, anch = self.find_4_positive(p, targets)
        #indices, anch = self.find_5_positive(p, targets)
        #indices, anch = self.find_9_positive(p, targets)

        matching_bs = [[] for pp in p]
        matching_as = [[] for pp in p]
        matching_gjs = [[] for pp in p]
        matching_gis = [[] for pp in p]
        matching_targets = [[] for pp in p]
        matching_anchs = [[] for pp in p]

        nl = len(p)

        for batch_idx in range(p[0].shape[0]):  # match image by image
            b_idx = targets[:, 0] == batch_idx  # targets belonging to this image
            this_target = targets[b_idx]  # this_target.shape = (nt, 6) = (GT count, image_index + cls_id + xywh)
            if this_target.shape[0] == 0:  # skip images without GT boxes
                continue

            txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1]  # scale the normalized GT boxes back to pixels, shape (nt, 4)
            txyxy = xywh2xyxy(txywh)  # convert to (top-left xy, bottom-right xy), shape (nt, 4)

            pxyxys = []
            p_cls = []
            p_obj = []
            from_which_layer = []
            all_b = []
            all_a = []
            all_gj = []
            all_gi = []
            all_anch = []

            for i, pi in enumerate(p):  # p sizes = [(bs,3,80,80,85), (bs,3,40,40,85), (bs,3,20,20,85)]
                b, a, gj, gi = indices[i]  # this layer's candidate positives: image index, anchor index, gj, gi
                idx = (b == batch_idx)  # candidates belonging to the current image
                b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx]
                all_b.append(b)  # image index
                all_a.append(a)  # anchor index
                all_gj.append(gj)
                all_gi.append(gi)
                all_anch.append(anch[i][idx])  # anchors
                from_which_layer.append(torch.ones(size=(len(b),)) * i)  # record the source layer: [i, i, i, ...]

                fg_pred = pi[b, a, gj, gi]  # predictions at the candidate locations, shape (len(b), 85)
                p_obj.append(fg_pred[:, 4:5])  # obj predictions
                p_cls.append(fg_pred[:, 5:])  # cls predictions

                grid = torch.stack([gi, gj], dim=1)  # [gi, gj], shape (len(b), 2)
                pxy = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i]  #/ 8.  decoded xy, in pixels
                #pxy = (fg_pred[:, :2].sigmoid() * 3. - 1. + grid) * self.stride[i]
                pwh = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i]  #/ 8.  decoded wh, in pixels
                pxywh = torch.cat([pxy, pwh], dim=-1)  # [x, y, w, h], shape (len(b), 4)
                pxyxy = xywh2xyxy(pxywh)  # [x1, y1, x2, y2], shape (len(b), 4)
                pxyxys.append(pxyxy)

            pxyxys = torch.cat(pxyxys, dim=0)  # all candidate boxes of this image across layers, shape (P, 4)
            if pxyxys.shape[0] == 0:
                continue
            p_obj = torch.cat(p_obj, dim=0)  # shape (P, 1)
            p_cls = torch.cat(p_cls, dim=0)  # shape (P, nc)
            from_which_layer = torch.cat(from_which_layer, dim=0)  # shape (P,)
            all_b = torch.cat(all_b, dim=0)  # shape (P,)
            all_a = torch.cat(all_a, dim=0)  # shape (P,)
            all_gj = torch.cat(all_gj, dim=0)  # shape (P,)
            all_gi = torch.cat(all_gi, dim=0)  # shape (P,)
            all_anch = torch.cat(all_anch, dim=0)  # shape (P, 2)

            pair_wise_iou = box_iou(txyxy, pxyxys)  # IoU matrix between GTs and candidate boxes, shape (nt, P)
            pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8)  # IoU loss matrix, shape (nt, P)

            top_k, _ = torch.topk(pair_wise_iou, min(10, pair_wise_iou.shape[1]), dim=1)  # topk sorts descending; each GT keeps its 10 highest IoUs, shape (nt, 10)
            dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1)  # sum the top-10 IoUs and truncate: how many positives each GT gets, shape (nt,); clamp guarantees every GT at least one positive

            gt_cls_per_image = (
                F.one_hot(this_target[:, 1].to(torch.int64), self.nc)  # this_target[:, 1] holds the GT classes; one-hot gives shape (nt, nc)
                .float()
                .unsqueeze(1)
                .repeat(1, pxyxys.shape[0], 1)  # repeat once per candidate (pxyxys.shape = (P, 4)), shape (nt, P, nc)
            )

            num_gt = this_target.shape[0]  # GT count; this_target.shape = (nt, 6)
            cls_preds_ = (
                p_cls.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()  # shape (nt, P, nc)
                * p_obj.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()  # shape (nt, P, 1)
            )

            y = cls_preds_.sqrt_()
            pair_wise_cls_loss = F.binary_cross_entropy_with_logits(
                torch.log(y / (1 - y)), gt_cls_per_image, reduction="none"
            ).sum(-1)  # shape (nt, P)
            del cls_preds_

            cost = (
                pair_wise_cls_loss
                + 3.0 * pair_wise_iou_loss
            )  # shape (nt, P)

            matching_matrix = torch.zeros_like(cost)  # torch.zeros_like builds an all-zero tensor of the given shape, (nt, P)

            for gt_idx in range(num_gt):
                _, pos_idx = torch.topk(
                    cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False
                )  # indices of the dynamic_k lowest-cost candidates for this GT
                matching_matrix[gt_idx][pos_idx] = 1.0  # mark them as matched

            del top_k, dynamic_ks
            # e.g. matching_matrix = [[1,0,0,1], [0,0,0,1]] (nt=2, P=4) -> matching_matrix.sum(0) = [1,0,0,2]
            anchor_matching_gt = matching_matrix.sum(0)  # shape (P,): how many GTs each candidate was assigned to
            if (anchor_matching_gt > 1).sum() > 0:  # some candidate is claimed by more than one GT
                _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)  # among the competing GTs, keep only the one with the lowest cost; cost_argmin holds the winning GT index
                matching_matrix[:, anchor_matching_gt > 1] *= 0.0  # first clear the contested columns, e.g. matching_matrix = [[1,0,0,0],[0,0,0,0]]
                matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0  # re-set the cheapest GT in each contested column, e.g. matching_matrix = [[1,0,0,0],[0,0,0,1]]
            fg_mask_inboxes = matching_matrix.sum(0) > 0.0  # candidates that ended up matched, shape (P,), e.g. [True, False, False, True]
            matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)  # take the True columns and argmax over GTs; e.g. matching_matrix[:, fg_mask_inboxes] = [[1,0],[0,1]] -> each kept positive's GT index = [0, 1]

            from_which_layer = from_which_layer[fg_mask_inboxes]  # source layer of each kept positive, shape (mP,); mP = positives matched to a GT
            all_b = all_b[fg_mask_inboxes]  # image index of each kept positive, shape (mP,)
            all_a = all_a[fg_mask_inboxes]  # anchor index of each kept positive, shape (mP,)
            all_gj = all_gj[fg_mask_inboxes]  # gj of each kept positive, shape (mP,)
            all_gi = all_gi[fg_mask_inboxes]  # gi of each kept positive, shape (mP,)
            all_anch = all_anch[fg_mask_inboxes]  # anchor of each kept positive, shape (mP, 2)
            this_target = this_target[matched_gt_inds]  # the GT assigned to each kept positive, shape (mP, 6)

            for i in range(nl):  # split the kept positives back into their source layers
                layer_idx = from_which_layer == i
                matching_bs[i].append(all_b[layer_idx])
                matching_as[i].append(all_a[layer_idx])
                matching_gjs[i].append(all_gj[layer_idx])
                matching_gis[i].append(all_gi[layer_idx])
                matching_targets[i].append(this_target[layer_idx])
                matching_anchs[i].append(all_anch[layer_idx])

        for i in range(nl):  # merge the per-image results of every layer
            if matching_targets[i] != []:
                matching_bs[i] = torch.cat(matching_bs[i], dim=0)
                matching_as[i] = torch.cat(matching_as[i], dim=0)
                matching_gjs[i] = torch.cat(matching_gjs[i], dim=0)
                matching_gis[i] = torch.cat(matching_gis[i], dim=0)
                matching_targets[i] = torch.cat(matching_targets[i], dim=0)
                matching_anchs[i] = torch.cat(matching_anchs[i], dim=0)
            else:
                matching_bs[i] = torch.tensor([], device='cuda:0', dtype=torch.int64)
                matching_as[i] = torch.tensor([], device='cuda:0', dtype=torch.int64)
                matching_gjs[i] = torch.tensor([], device='cuda:0', dtype=torch.int64)
                matching_gis[i] = torch.tensor([], device='cuda:0', dtype=torch.int64)
                matching_targets[i] = torch.tensor([], device='cuda:0', dtype=torch.int64)
                matching_anchs[i] = torch.tensor([], device='cuda:0', dtype=torch.int64)

        return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs
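The dynamic-k step is the heart of the matching, so here is a toy reproduction (a minimal sketch of my own: the 2×5 IoU matrix is made up, and the cost omits the classification term the real code adds) showing how dynamic_ks is derived, how matching_matrix is filled, and how a candidate claimed by two GTs is resolved.

import torch

# Toy sketch of build_targets' SimOTA matching: 2 GTs vs 5 candidate predictions.
pair_wise_iou = torch.tensor([[0.8, 0.6, 0.1, 0.7, 0.0],
                              [0.1, 0.2, 0.9, 0.65, 0.3]])
cost = -torch.log(pair_wise_iou + 1e-8)  # simplified cost (the real one adds a cls term)

# Each GT's k = floor(sum of its top-10 IoUs), at least 1.
top_k, _ = torch.topk(pair_wise_iou, min(10, pair_wise_iou.shape[1]), dim=1)
dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1)
print(dynamic_ks)  # tensor([2, 2], dtype=torch.int32)

matching_matrix = torch.zeros_like(cost)
for gt_idx in range(cost.shape[0]):
    _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
    matching_matrix[gt_idx][pos_idx] = 1.0  # each GT claims its dynamic_k cheapest candidates

# Candidate 3 is claimed by both GTs; keep only the GT with the lower cost there.
anchor_matching_gt = matching_matrix.sum(0)
if (anchor_matching_gt > 1).sum() > 0:
    _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
    matching_matrix[:, anchor_matching_gt > 1] *= 0.0
    matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0

print(matching_matrix)  # [[1,0,0,1,0], [0,0,1,0,0]]: every column now has at most one 1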
Finally, `find_3_positive` is the coarse, YOLOv5-style assignment that feeds SimOTA; the wh-ratio filter and the neighbor-cell offsets are explained inline.

    def find_3_positive(self, p, targets):
        # p: predictions from the 3 layers, sizes [(bs,3,80,80,85), (bs,3,40,40,85), (bs,3,20,20,85)]
        # targets: (nt, 6) = (number of labels, image_index + cls_id + bbox)
        # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
        na, nt = self.na, targets.shape[0]  # number of anchors = 3, number of GTs (the examples below assume nt = 5)
        indices, anch = [], []
        gain = torch.ones(7, device=targets.device).long()  # normalized-to-gridspace gain = (1,1,1,1,1,1,1)
        ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt)  # same as .repeat_interleave(nt); shape (3, nt) = [[0.,0.,0.,0.,0.], [1.,1.,1.,1.,1.], [2.,2.,2.,2.,2.]]
        targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)  # append anchor index: targets.repeat(na,1,1).shape = (3, nt, 6), ai[:, :, None].shape = (3, nt, 1)
        # targets.shape = (3, nt, 7): each GT is replicated once per anchor with its anchor index appended -> (image_index + cls_id + bbox + anchor_index)

        g = 0.5  # bias
        off = torch.tensor([[0, 0],
                            [1, 0], [0, 1], [-1, 0], [0, -1],  # j,k,l,m
                            # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
                            ], device=targets.device).float() * g  # offsets

        for i in range(self.nl):  # process each detection layer
            anchors = self.anchors[i]  # shape (3, 2); at stride 32, anchors = [[10,13], [16,30], [33,23]]
            gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]]  # xyxy gain; p[i].shape = (bs, 3, h, w, nc+5), h/w = feature-map height/width
            # gain = [1, 1, w, h, w, h, 1]

            # Match targets to anchors
            t = targets * gain  # map the normalized (0~1) GT xywh onto this feature map by multiplying by its size; shape (3, nt, 7)
            if nt:
                # Matches
                r = t[:, :, 4:6] / anchors[:, None]  # wh ratio, shape (3, nt, 2)
                """
                Match the GT wh against the anchor wh and drop pairs whose ratio exceeds hyp['anchor_t']
                (this filter comes from YOLOv5), so the new box regression stays well-conditioned.
                YOLOv3 regressed wh with out = exp(in), which is dangerous: exp(in) can grow without
                bound, causing runaway gradients, instability, NaN losses and ultimately a failed run.
                (The original YOLOv3 inverted the targets and compared in in-space, which mostly avoided
                this; but with an IoU loss the network output must be decoded to out-space, so the
                problem resurfaces.) The authors therefore use a new wh regression,
                (wh.sigmoid() * 2) ** 2 * anchors[i], instead of YOLOv3's anchors[i] * exp(wh),
                which bounds the GT/anchor ratio to 0~4; hyp.scratch.yaml sets anchor_t = 4, so this
                hyperparameter is also what judges how well an anchor fits a label box.
                """
                j = torch.max(r, 1. / r).max(2)[0] < self.hyp['anchor_t']  # compare; j.shape = (3, nt): keep only pairs with GT wh / anchor wh < anchor_t = 4.0 in both directions
                # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
                t = t[j]  # filter; after filtering t.shape = (M, 7), M = number of surviving pairs
                """
                Keep boxes satisfying 1 / hyp['anchor_t'] < target_wh / anchor_wh < hyp['anchor_t'].
                Since the wh regression already bounds the GT/anchor ratio to 0~4, this filter wastes a
                bit of output space; and because each pyramid level owns anchors of a different scale,
                distributing labels by the wh ratio effectively assigns GTs of different sizes to
                different feature levels for regression.
                """

                # Offsets
                gxy = t[:, 2:4]  # grid xy: box centers measured from the top-left corner, shape (M, 2)
                gxi = gain[[2, 3]] - gxy  # inverse: the same centers measured from the bottom-right corner, shape (M, 2); gxy + gxi = feature-map size
                """
                Extract the boxes whose center lies within 0.5 of a cell border (x < 0.5 or y < 0.5 from
                the top-left, and likewise from the bottom-right): these are j, k, l, m. When assigning a
                label to a grid cell (gij = (gxy - offsets).long() below), these four groups are shifted
                by the offsets above and concatenated with the original gxy, five groups in total. In
                other words: each cell is split 2x2 into quadrants, and a box in a quadrant is regressed
                not only by its own cell's anchors but also by the two adjacent cells' anchors. Plain
                YOLOv3 used only the box's own cell; this is presumably meant to mitigate the grid effect
                (YOLOv5 never published a paper, so this is speculation; YOLOv4 addresses the same effect
                by scaling the sigmoid output by a factor > 1), and it goes hand in hand with YOLOv5's new
                box regression. Because of this, the center regression range grows from YOLOv3's 0~1 to
                -0.5~1.5, so the center formula becomes: xy.sigmoid() * 2. - 0.5 + cx
                """
                # x < 0.5 selects the cell to the left; y < 0.5 selects the cell above.
                # gxy % 1. is the fractional part of the center, i.e. its offset within the cell; j is for x, k is for y
                j, k = ((gxy % 1. < g) & (gxy > 1.)).T
                l, m = ((gxi % 1. < g) & (gxi > 1.)).T  # ((gxi % 1. < g) & (gxi > 1.)).T has shape (2, M); j, k, l, m each have shape (M,)
                j = torch.stack((torch.ones_like(j), j, k, l, m))  # torch.ones_like builds an all-ones tensor of the same shape; j.shape = (5, M)
                t = t.repeat((5, 1, 1))[j]  # t.repeat((5,1,1)).shape = (5, M, 7); selecting with j leaves (N, 7), N = kept count.
                # t is replicated 5 times (the cell itself plus the four neighbor candidates) and 3 of the
                # 5 are chosen. Which 3? The mask j from torch.stack decides: its first row is all ones,
                # so the cell itself is always selected; of the remaining 4, j/l and k/m are pairwise
                # mutually exclusive (inverse coordinates), so exactly 3 rows survive.
                offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]  # (1, M, 2) + (5, 1, 2) = (5, M, 2) --[j]--> (N, 2)
            else:
                t = targets[0]
                offsets = 0

            # Define
            b, c = t[:, :2].long().T  # image, class; b = which image in the batch this bbox belongs to
            gxy = t[:, 2:4]  # grid xy: GT box centers
            gwh = t[:, 4:6]  # grid wh: GT box sizes
            gij = (gxy - offsets).long()  # .long() truncates to the cell index
            gi, gj = gij.T  # grid xy indices: (gi, gj) is the cell responsible for predicting this GT box

            # Append
            a = t[:, 6].long()  # anchor indices: which anchor of this layer the GT box matched
            indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1)))  # image, anchor index, grid indices
            anch.append(anchors[a])  # anchors

        return indices, anch
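To make the offset trick concrete, here is a standalone sketch (the grid coordinate is made up) of which 3 cells a single GT center selects in find_3_positive: its own cell, plus the two adjacent cells its fractional part is closest to, tripling the candidate positives before SimOTA filters them.

import torch

# Standalone sketch (made-up numbers): which 3 cells one GT center selects in find_3_positive.
g = 0.5
off = torch.tensor([[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]]).float() * g

gxy = torch.tensor([[13.3, 42.7]])        # GT center in grid units on an 80x80 map
gxi = torch.tensor([80., 80.]) - gxy      # same center measured from the bottom-right corner

j, k = ((gxy % 1. < g) & (gxy > 1.)).T    # fractional x/y close to the left/top border?
l, m = ((gxi % 1. < g) & (gxi > 1.)).T    # fractional x/y close to the right/bottom border?
mask = torch.stack((torch.ones_like(j), j, k, l, m))  # (5, 1): the own cell is always kept

t = gxy.repeat((5, 1, 1))[mask]           # the 3 selected copies of the GT center
offsets = (torch.zeros_like(gxy)[None] + off[:, None])[mask]
print((t - offsets).long())               # tensor([[13, 42], [12, 42], [13, 43]]): own, left, below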