img_size = [H, W, C] = [224, 224, 3] # 图片尺寸，代码中的img_size为[H, W]
patch_size = [16, 16] # patch尺寸
num_patches = 224/16 * 224/16 = 14*14 = 196 # patch数
embed_dim = 768 # 一个patch_embed的长度
batch_size = B # batch_size简写，我选的是16。
camera_num = 1 # 不启用TransReID的相机SIE
view_num = 3 # 启用view_num，但是表示模态数，三模态。

模型构造

`class build_transformer()` - model/make_model.py

def __init__(self, num_classes, camera_num, view_num, cfg, factory):
  self.base = TransReID()
  self.base.load_param()
  self.gap = nn.AdaptiveAvgPool2d(1)
  self.classifier = arcface/cosface/amsoftmax/circle/nn.Linear(in_planes=768, num_classes, bias=False) 
  self.bottleneck = BatchNorm1d(768)

def forward(self, x, label=None, cam_label= None, view_label=None):
  # x:[B, 3, 224, 224]
  global_feat = self.base(x, cam, view) # ->[B, 768]
  feat = self.bottleneck(global_feat) # ->[B, 768]
  if training:
    cls_score = self.classifier(feat(, label)) # ->[B, num_classes] classifier为nn.Linear时不用(, label)
    return cls_score, global_feat # [B, num_classes], [B, 768] 
  else: 
    return feat/global_feat

`class TransReID()` - model/backbones/vit_pytorch.py

def __init__():
  self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) # [1, 1, 768]
  self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) # [1, 197, 768]
  self.patch_embed = PatchEmbed() # [B, 196, 768]
  self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim)) # [3, 1, 768]
  self.pos_drop = nn.Dropout()
  self.blocks = [Block() for i in range(depth)]
  self.norm = norm_layer(embed_dim) # [768]
  self.fc = nn.Linear(embed_dim, num_classes) # [768, 1000]
  trunc_normal_(cls_token与pos_embed)

def forward(self, x, cam_label=None, view_label=None):
  x = self.forward_features(x, cam_label, view_label)
  return x
def forward_features(self, x, camera_id, view_id):
  # x:[B, 3, 224, 224]
  x = self.patch_embed(x) # ->[B, 196, 768]，由[B, 3, 224, 224]的img经过切patch及线性投影转变而来
  cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) # ->[1*B, 1, 768], 重复B个cls_tokens
  x = torch.cat((cls_tokens, x), dim=1) # ->[B, 1+196, 768]
  x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id] # ->[B, 197, 768]，广播合并。
  x = self.pos_drop(x) # 应用dropout
  for blk in self.blocks: # 不使用TransReID的JPM的return分支。
    x = blk(x) # ->[B, 197, 768]
  x = self.norm(x)
  # [B, cls_token + patch_embed_1 + patch_embed_2 + ... + patch_embed_196, embed_dimension](B, 1+196, 768)
  # ->[B, cls_token, embed_dimension](B, 1, 768) -> [B, embed_dimension](B, 768)
  # 即只取出每个batch里的cls_token，我还以为每个patch都参与分类，原来就cls参与，因为forward()不好调试我还写了代码来确定我这样是不是对的(我错了，我只是还没debug到forward那边)。
  # 这个是ViT的原文：Similar to BERT’s [class] token, we prepend a learnable embedding to the sequence of embedded patches ($z_0^0 = x_{class}$), whose state at the output of the Transformer encoder ($z_L^0$) serves as the image representation $y$ (Eq. 4).
  # 之前看有解说视频吐槽这个cls是没用的，然后TransReID的图强调了他们改了这个cls，我一直误以为这玩意真的没啥用，没想到最后是唯一用上的...
  return x[:, 0] # [B, 768]

`class PatchEmbed()` - model/backbones/vit_pytorch.py

def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
  self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, x):
  B, C, H, W = x.shape # [B, 3, 224, 224]
  # [B, C, H, W] -> [B, embed_dim(C2), H/patch_size(H2), W/patch_size(W2)] -> [B, C2, H2W2] -> [B, H2W2, C2]
  # 实际上是[Batch_size, num_patches, embed_dim]，即[B, 196, 768]
  x = self.proj(x).flatten(2).transpose(1, 2) 
  return x

`class Block()` - model/backbones/vit_pytorch.py

def __init__(dim=768):
  self.norm1 = norm_layer(dim)
  self.attn = Attention() # 输入[B, 197, 768]，输出[B, 197, 768]
  self.drop_path = DropPath()
  self.norm2 = norm_layer(dim)
  self.mlp = Mlp() # 输入[B, 197, 768]，输出[B, 197, 768]
def forward(self, x):
  # x:[B, 197, 768]
  x = x + self.drop_path(self.attn(self.norm1(x))) # 归一化(LayerNorm)、自注意力、DropPath、残差
  x = x + self.drop_path(self.mlp(self.norm2(x))) # 归一化(LayerNorm)、MLP、DropPath、残差
  return x # [B, 197, 768]

`class Attention()` - model/backbones/vit_pytorch.py

def __init__(self, dim=768, num_heads=12, ):
  self.qkv = nn.Linear(dim, dim * 3, bias=True)
  self.attn_drop = nn.Dropout(attn_drop)
  self.proj = nn.Linear(dim, dim)
  self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
  B, N, C = x.shape # [B, 197, 768]
  # x -> qkv: [B, 197, 768] -> [B, 197, 768*3] -> [B, 197, 3, 12, 64] -> [3, B, 12, 197, 64]
  # 即[q+k+v, Batch_size, num_heads, cls_tokens+patches_embed, head_channel]。
  # 我看Transformer论文里好像没提到一个head里面的数据叫什么，不过既然它是由一个channel按head平分的，那就叫head_channel吧。
  # 以及这里的channel应该指的是Transformer里的dimension，即d_model（Transformer原论文中d_model = 512，这里embed_dim = 768）
  qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C//self.num_heads).permute(2, 0, 3, 1, 4)
  q, k, v = qkv[0], qkv[1], qkv[2] # q,k,v均为[B, 12, 197, 64]
  attn = (q @ k.transpose(-2, -1)) * self.scale # [B, 12, 197, 64] @ [B, 12, 64, 197] -> [B, 12, 197, 197]，QK矩阵乘法并放缩值域
  attn = attn.softmax(dim=-1) # 最后一维进行softmax
  attn = self.attn_drop(attn) # 应用dropout
  # [B, 12, 197, 197] @ [B, 12, 197, 64] -> [B, 12, 197, 64] -> [B, 197, 12, 64] -> [B, 197, 768]
  x = (attn @ v).transpose(1, 2).reshape(B, N, C)
  x = self.proj(x) # ->[B, 197, 768] 对(q@k)@v来一次全连接层，并dropout。
  x = self.proj_drop(x) # 应用dropout
  return x # [B, 197, 768]

`class Mlp()` - model/backbones/vit_pytorch.py

def __init__(in_features=768, hidden_features=768*4, act_layer=nn.GELU, ):
  out_features = out_features or in_features
  hidden_features = hidden_features or in_features
  self.fc1 = nn.Linear(in_features, hidden_features)
  self.act = act_layer()
  self.fc2 = nn.Linear(hidden_features, out_features)
  self.drop = nn.Dropout()
def forward(self, x):
  # x:[B, 197, 768]
  x = self.fc1(x) # ->[B, 197, 768*4]
  x = self.act(x) # 应用GELU激活函数
  x = self.drop(x) # 应用dropout
  x = self.fc2(x) # ->[B, 197, 768]
  x = self.drop(x) # 应用dropout
  return x # [B, 197, 768]

损失函数构造 - 略

`make_loss()` - loss/make_loss.py

center_criterion = CenterLoss(num_classes=num_classes, feat_dim=2048, use_gpu=True)
triplet = TripletLoss()
return loss_func, center_criterion

`class CenterLoss()` - loss/center_loss.py

def __init__(self, num_classes=751, feat_dim=2048, use_gpu=True):
  self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim))
def forward(self, x, labels):
  # x:[B, feat_dim], feature matrix.
  # labels:[B] with truth labels（每个样本一个真实标签，与x的第0维一致）.

优化器配置 - 略

`make_optimizer()` - solver/make_optimizer.py

# SGD
optimizer = getattr(torch.optim, cfg.SOLVER.OPTIMIZER_NAME)(params, momentum=cfg.SOLVER.MOMENTUM)
optimizer_center = torch.optim.SGD(center_criterion.parameters(), lr=cfg.SOLVER.CENTER_LR)
return optimizer, optimizer_center

调度器配置 - 略

`create_scheduler()` - solver/scheduler_factory.py

lr_scheduler = CosineLRScheduler()
return lr_scheduler

模型训练

`do_train()` - processor/processor.py

scaler = amp.GradScaler()
for epoch in range(1, epochs + 1):
  model.train()
  scheduler.step(epoch)
  for n_iter, (img, vid, target_cam, target_view) in enumerate(train_loader):
    """
    # img:[B, 3, 224, 224], B张图片，3通道，H=224，W=224，经 数据集加载 里的train_transforms处理
    # vid:[B]，B张图片对应的B个id
    # target_cam:[B]，B张图片对应的B个cam_id
    # target_view:[B]，B张图片对应的B个view_id
    """
    score, feat = model(img, vid, target_cam, target_view ) # 得到分类结果score:[B, num_classes]，和特征feat:[B, 768]
    loss = loss_fn(score, feat, vid, target_cam) # 计算损失
    scaler.scale(loss).backward() # 梯度后向传播
    acc = (score.max(1)[1] == vid).float().mean() # 求accuracy，vid即真实标签target
  模型存储
  model.eval()
  for n_iter, (img, vid, camid, camids, target_view, _) in enumerate(val_loader):
    feat = model(img, camids, target_view)
    evaluator.update((feat, vid, camid))
  cmc, mAP, _, _, _, _, _ = evaluator.compute()

版权声明：
作者：MWHLS
链接：https://mwhls.top/3747.html
来源：无镣之涯
文章版权归作者所有，未经允许请勿转载。

THE END

python transformer 机器学习

二维码

打赏

译（五十四）-RuntimeError Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same

< <上一篇

译（五十五）-Python的append()和extend()有何区别

下一篇>>

搜索内容

ViT(TransReID)模型各阶段形状

代码

参数设置：

模型构造

`class build_transformer()` - model/make_model.py

`class TransReID()` - model/backbones/vit_pytorch.py

`class PatchEmbed()` - model/backbones/vit_pytorch.py

`class Block()` - model/backbones/vit_pytorch.py

`class Attention()` - model/backbones/vit_pytorch.py

`class Mlp()` - model/backbones/vit_pytorch.py

损失函数构造 - 略

`make_loss()` - loss/make_loss.py

`class CenterLoss()` - loss/center_loss.py

优化器配置 - 略

`make_optimizer()` - solver/make_optimizer.py

调度器配置 - 略

`create_scheduler()` - solver/scheduler_factory.py

模型训练

`do_train()` - processor/processor.py

取消回复

共有 1 条评论

分类

月份归档

标签云

最新评论

2026年 7月
一	二	三	四	五	六	日
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31