class DETR(nn.Module):
    """ This is the DETR module that performs object detection """

    def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
"""super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
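        # hidden_dim is the transformer's feature width; 256 in the standard DETR configuration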
        # classification head: num_classes + 1 outputs to include the no-object class
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        # regression head: 3-layer MLP that predicts the 4 box coordinates
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        # self.query_embed plays a role similar to the anchors of traditional detectors;
        # 100 queries are used here, shape [100, 256]. nn.Embedding is equivalent to
        # nn.Parameter for this purpose.
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
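        # 1x1 conv that projects the backbone's channels (2048 for ResNet-50) down to hidden_dim (256)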
        self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1)
        self.backbone = backbone
        self.aux_loss = aux_loss  # True

    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
- "pred_boxes": The normalized boxes coordinates for all queries, represented as
(center_x, center_y, height, width). These values are normalized in [0, 1],
relative to the size of each individual image (disregarding possible padding).
See PostProcess for information on how to retrieve the unnormalized bounding box.
- "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
dictionnaries containing the two above keys for each decoder layer.
"""ifisinstance(samples,(list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)

        # features: list{0: tensor=[bs,2048,19,26] + mask=[bs,19,26]}, the output of
        #           ResNet-50 block 5 of the backbone
        # pos: list{0: [bs,256,19,26]}, the positional encoding
        features, pos = self.backbone(samples)
        # src: Tensor [bs,2048,19,26]
        # mask: Tensor [bs,19,26]
        src, mask = features[-1].decompose()
        assert mask is not None
        # feed the data into the transformer for the forward pass:
        # self.input_proj(src): [bs,2048,19,26] -> [bs,256,19,26]
        # mask: True marks padded pixels, which are excluded from the attention computation
        # self.query_embed.weight: similar to the anchors of traditional detectors, 100 of them
        # pos[-1]: positional encoding [bs, 256, 19, 26]
        # hs: [6, bs, 100, 256]
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]
        # classification: [6 decoder layers, bs, 100, 256] -> [6, bs, 100, 92 (classes)]
        outputs_class = self.class_embed(hs)
        # regression: [6 decoder layers, bs, 100, 256] -> [6, bs, 100, 4]
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:  # True
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        # out is a dict with 3 entries:
        #   0. pred_logits: classification head output [bs, 100, 92 (num classes)]
        #   1. pred_boxes: regression head output [bs, 100, 4]
        #   2. aux_outputs: list of 5, the outputs of the first 5 decoder layers,
        #      i.e. 5 x pred_logits [bs, 100, 92] and 5 x pred_boxes [bs, 100, 4]
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
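
# ---------------------------------------------------------------------------
# For reference, bbox_embed uses the small feed-forward helper defined next to
# DETR in models/detr.py; the sketch below is consistent with that helper.
# ---------------------------------------------------------------------------
import torch.nn.functional as F

class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN) """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        # e.g. MLP(256, 256, 4, 3) -> Linear(256,256), Linear(256,256), Linear(256,4)
        self.layers = nn.ModuleList(nn.Linear(n, k)
                                    for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            # ReLU between layers, no activation after the last one
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


# ---------------------------------------------------------------------------
# Minimal usage sketch, assuming build_backbone/build_transformer are the
# factory functions from backbone.py and transformer.py and `args` carries the
# usual DETR hyper-parameters; the shapes follow the shape comments above.
# ---------------------------------------------------------------------------
# backbone = build_backbone(args)        # ResNet-50 + positional encoding
# transformer = build_transformer(args)  # d_model=256, 6 encoder / 6 decoder layers
# model = DETR(backbone, transformer, num_classes=91, num_queries=100, aux_loss=True)
#
# images = torch.randn(2, 3, 608, 832)   # a plain tensor is wrapped into a NestedTensor
# out = model(images)
# out['pred_logits'].shape               # [2, 100, 92]
# out['pred_boxes'].shape                # [2, 100, 4]
# len(out['aux_outputs'])                # 5, one per intermediate decoder layer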