ViT(TransReID)模型各阶段形状
之前看TransReID代码时的记录,他们代码写得很全,训练测试都有。
这段时间在这个基础上改了点代码,暂时没出现模型理解错的地方。
ViT外的改进没有记录。有错误或疑问请留言,谢谢。
目录
代码
- TransReID-GitHub
- 下面是TransReID里的ViT结构,不包括TransReID的改进。
- 代码简写了。
参数设置:
img_size = [H, W, C] = [224, 224, 3] # 图片尺寸,代码中的img_size为[H, W]
patch_size = [16, 16] # patch尺寸
num_patches = 224/16 * 224/16 = 14*14 = 196 # patch数
embed_dim = 768 # 一个patch_embed的长度
batch_size = B # batch_size简写,我选的是16。
camera_num = 1 # 不启用TransReID的相机SIE
view_num = 3 # 启用view_num,但是表示模态数,三模态。
模型构造
class build_transformer()
- model/make_model.py
def __init__(self, num_classes, camera_num, view_num, cfg, factory):
self.base = TransReID()
self.base.load_param()
self.gap = nn.AdaptiveAvgPool2d(1)
self.classifier = arcface/cosface/amsoftmax/circle/nn.Linear(in_planes=768, num_classes, bias=False)
self.bottleneck = BatchNorm1d(768)
def forward(self, x, label=None, cam_label= None, view_label=None):
# x:[B, 3, 224, 224]
global_feat = self.base(x, cam, view) # ->[B, 768]
feat = self.bottleneck(global_feat) # ->[B, 768]
if training:
cls_score = self.classifier(feat(, label)) # ->[B, num_classes] classifier为nn.Linear时不用(, label)
return cls_score, global_feat # [B, num_classes], [B, 768]
else:
return feat/global_feat
class TransReID()
- model/backbones/vit_pytorch.py
def __init__():
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) # [1, 1, 768]
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) # [1, 197, 768]
self.patch_embed = PatchEmbed() # [B, 196, 768]
self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim)) # [3, 1, 768]
self.pos_drop = nn.Dropout()
self.blocks = [Block() for i in range(depth)]
self.norm = norm_layer(embed_dim) # [768]
self.fc = nn.Linear(embed_dim, num_classes) # [768, 1000]
trunc_normal_(cls_token与pos_embed)
def forward(self, x, cam_label=None, view_label=None):
x = self.forward_features(x, cam_label, view_label)
return x
def forward_features(self, x, camera_id, view_id):
# x:[B, 3, 224, 224]
x = self.patch_embed(x) # ->[B, 196, 768],由[B, 3, 224, 224]的img经过切patch及线性投影转变而来
cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) # ->[1*B, 1, 768], 重复B个cls_tokens
x = torch.cat((cls_tokens, x), dim=1) # ->[B, 1+196, 768]
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id] # ->[B, 197, 768],广播合并。
x = self.pos_drop(x) # 应用dropout
for blk in self.blocks: # 不使用TransReID的JPM的return分支。
x = blk(x) # ->[B, 197, 768]
x = self.norm(x)
# [B, cls_token + patch_embed_1 + patch_embed_2 + ... + patch_embed_196, embed_dimension](B, 1+196, 768)
# ->[B, cls_token, embed_dimension](B, 1, 768) -> [B, embed_dimension](B, 768)
# 即只取出每个batch里的cls_token,我还以为每个patch都参与分类,原来就cls参与,因为forward()不好调试我还写了代码来确定我这样是不是对的(我错了,我只是还没debug到forward那边)。
# 这个是ViT的原文:Similar to BERT’s [class] token, we prepend a learnable embedding to the sequence of embedded patches ($\mathbf{z}_0^0 = \mathbf{x}_\text{class}$), whose state at the output of the Transformer encoder ($\mathbf{z}_L^0$) serves as the image representation $\mathbf{y}$ (Eq. 4).
# 之前看有解说视频吐槽这个cls是没用的,然后TransReID的图强调了他们改了这个cls,我一直误以为这玩意真的没啥用,没想到最后是唯一用上的...
return x[:, 0] # [B, 768]
class PatchEmbed()
- model/backbones/vit_pytorch.py
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768)
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, x):
B, C, H, W = x.shape # [B, 3, 224, 224]
# [B, C, H, W] -> [B, embed_dim(C2), H/patch_size(H2), W/patch_size(W2)] -> [B, C2, H2W2] -> [B, H2W2, C2]
# 实际上是[Batch_size, num_patches, embed_dim],即[B, 196, 768]
x = self.proj(x).flatten(2).transpose(1, 2)
return x
class Block()
- model/backbones/vit_pytorch.py
def __init__(dim=768):
self.norm1 = norm_layer(dim)
self.attn = Attention() # 输入[B, 197, 768],输出[B, 197, 768]
self.drop_path = DropPath()
self.norm2 = norm_layer(dim)
self.mlp = Mlp() # 输入[B, 197, 768],输出[B, 197, 768]
def forward(self, x):
# x:[B, 197, 768]
x = x + self.drop_path(self.attn(self.norm1(x))) # 归一化(norm1)、自注意力、DropPath、残差
x = x + self.drop_path(self.mlp(self.norm2(x))) # 归一化(norm2)、MLP、DropPath、残差
return x # [B, 197, 768]
class Attention()
- model/backbones/vit_pytorch.py
def __init__(self, dim=768, num_heads=12, ):
self.qkv = nn.Linear(dim, dim * 3, bias=True)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape # [B, 197, 768]
# x -> qkv: [B, 197, 768] -> [B, 197, 768*3] -> [B, 197, 3, 12, 64] -> [3, B, 12, 197, 64]
# 即[q+k+v, Batch_size, num_heads, cls_tokens+patches_embed, head_channel]。
# 我看Transformer论文里好像没提到一个head里面的数据叫什么,不过既然它是由一个channel按head平分的,那就叫head_channel吧。
# 以及这里的channel应该指的是Transformer里的dimension,即d_model = 512
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C//self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # q,k,v均为[B, 12, 197, 64]
attn = (q @ k.transpose(-2, -1)) * self.scale # [B, 12, 197, 64] @ [B, 12, 64, 197] -> [B, 12, 197, 197],QK矩阵乘法并放缩值域
attn = attn.softmax(dim=-1) # 最后一维进行softmax
attn = self.attn_drop(attn) # 应用dropout
# [B, 12, 197, 197] @ [B, 12, 197, 64] -> [B, 12, 197, 64] -> [B, 197, 12, 64] -> [B, 197, 768]
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x) # ->[B, 197, 768] 对(q@k)@v来一次全连接层,并dropout。
x = self.proj_drop(x) # 应用dropout
return x # [B, 197, 768]
class Mlp()
- model/backbones/vit_pytorch.py
def __init__(in_features=768, hidden_features=768*4, act_layer=nn.GELU, ):
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout()
def forward(self, x):
# x:[B, 197, 768]
x = self.fc1(x) # ->[B, 197, 768*4]
x = self.act(x) # 应用GELU激活函数
x = self.drop(x) # 应用dropout
x = self.fc2(x) # ->[B, 197, 768]
x = self.drop(x) # 应用dropout
return x # [B, 197, 768]
损失函数构造 - 略
make_loss()
- loss/make_loss.py
center_criterion = CenterLoss(num_classes=num_classes, feat_dim=2048, use_gpu=True)
triplet = TripletLoss()
return loss_func, center_criterion
class CenterLoss()
- loss/center_loss.py
def __init__(self, num_classes=751, feat_dim=2048, use_gpu=True):
self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim))
def forward(self, x, labels):
# x:[B, feat_dim], feature matrix.
# labels:[B], ground truth labels(每个样本一个id标签,与x的第0维对应).
优化器配置 - 略
make_optimizer()
- solver/make_optimizer.py
# SGD
optimizer = getattr(torch.optim, cfg.SOLVER.OPTIMIZER_NAME)(params, momentum=cfg.SOLVER.MOMENTUM)
optimizer_center = torch.optim.SGD(center_criterion.parameters(), lr=cfg.SOLVER.CENTER_LR)
return optimizer, optimizer_center
调度器配置 - 略
create_scheduler()
- solver/scheduler_factory.py
lr_scheduler = CosineLRScheduler()
return lr_scheduler
模型训练
do_train()
- processor/processor.py
scaler = amp.GradScaler()
for epoch in range(1, epochs + 1):
model.train()
scheduler.step(epoch)
for n_iter, (img, vid, target_cam, target_view) in enumerate(train_loader):
"""
# img:[B, 3, 224, 224], B张图片,3通道,H=224,W=224,经 数据集加载 里的train_transforms处理
# vid:[B],B张图片对应的B个id
# target_cam:[B],B张图片对应的B个cam_id
# target_view:[B],B张图片对应的B个view_id
"""
score, feat = model(img, vid, target_cam, target_view ) # 得到分类结果score:[B, num_classes],和特征feat:[B, 768]
loss = loss_fn(score, feat, vid, target_cam) # 计算损失
scaler.scale(loss).backward() # 梯度后向传播
acc = (score.max(1)[1] == vid).float().mean() # 求accuracy
模型存储
model.eval()
for n_iter, (img, vid, camid, camids, target_view, _) in enumerate(val_loader):
feat = model(img, cam_label=camids, view_label=target_view)
evaluator.update((feat, vid, camid))
cmc, mAP, _, _, _, _, _ = evaluator.compute()
上海seo
很专业。。。