mkang315 committed on
Commit 7757a1a · verified · 1 Parent(s): 1a8eb3e

Upload 123 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set of 123 files.
Files changed (50)
  1. .gitattributes +2 -0
  2. data/hyps/hyp.scratch-high.yaml +30 -0
  3. data/images/horses.jpg +3 -0
  4. data/multiplane.yaml +9 -0
  5. models/__init__.py +1 -0
  6. models/common.py +1296 -0
  7. models/detect/pk-yolo.yaml +126 -0
  8. models/detect/yolov9-e.yaml +144 -0
  9. models/experimental.py +275 -0
  10. models/repvit.py +440 -0
  11. models/tf.py +596 -0
  12. models/yolo.py +771 -0
  13. spark repvit/repvit_1kpretrained_timm_style.pth +3 -0
  14. spark/downstream_d2/README.md +101 -0
  15. spark/downstream_d2/configs/Base-RCNN-FPN.yaml +42 -0
  16. spark/downstream_d2/configs/coco_R_50_FPN_CONV_1x_moco_adam.yaml +57 -0
  17. spark/downstream_d2/convert-timm-to-d2.py +43 -0
  18. spark/downstream_d2/lr_decay.py +132 -0
  19. spark/downstream_d2/train_net.py +322 -0
  20. spark/downstream_imagenet/README.md +54 -0
  21. spark/downstream_imagenet/arg.py +137 -0
  22. spark/downstream_imagenet/data.py +151 -0
  23. spark/downstream_imagenet/lr_decay.py +61 -0
  24. spark/downstream_imagenet/main.py +189 -0
  25. spark/downstream_imagenet/mixup.py +168 -0
  26. spark/downstream_imagenet/models/__init__.py +104 -0
  27. spark/downstream_imagenet/models/convnext_official.py +201 -0
  28. spark/downstream_imagenet/requirements.txt +5 -0
  29. spark/downstream_imagenet/util.py +131 -0
  30. spark/downstream_mmdet/README.md +76 -0
  31. spark/downstream_mmdet/configs/_base_/default_runtime.py +16 -0
  32. spark/downstream_mmdet/configs/_base_/models/cascade_mask_rcnn_convnext_fpn.py +208 -0
  33. spark/downstream_mmdet/configs/_base_/models/mask_rcnn_convnext_fpn.py +128 -0
  34. spark/downstream_mmdet/configs/convnext_spark/mask_rcnn_convnext_base_patch4_window7_mstrain_480-800_adamw_3x_coco_in1k.py +95 -0
  35. spark/downstream_mmdet/mmcv_custom/__init__.py +15 -0
  36. spark/downstream_mmdet/mmcv_custom/customized_text.py +130 -0
  37. spark/downstream_mmdet/mmcv_custom/layer_decay_optimizer_constructor.py +123 -0
  38. spark/downstream_mmdet/mmcv_custom/runner/checkpoint.py +85 -0
  39. spark/downstream_mmdet/mmdet/models/backbones/__init__.py +20 -0
  40. spark/downstream_mmdet/mmdet/models/backbones/convnext.py +180 -0
  41. spark/pretrain/README.md +118 -0
  42. spark/pretrain/decoder.py +74 -0
  43. spark/pretrain/dist.py +118 -0
  44. spark/pretrain/encoder.py +208 -0
  45. spark/pretrain/main.py +191 -0
  46. spark/pretrain/models/__init__.py +62 -0
  47. spark/pretrain/models/convnext.py +125 -0
  48. spark/pretrain/models/custom.py +141 -0
  49. spark/pretrain/models/custom_detr.py +102 -0
  50. spark/pretrain/models/custom_origin.py +89 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/images/horses.jpg filter=lfs diff=lfs merge=lfs -text
+ spark/pretrain/viz_imgs/recon.png filter=lfs diff=lfs merge=lfs -text
data/hyps/hyp.scratch-high.yaml ADDED
@@ -0,0 +1,30 @@
+ lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3)
+ lrf: 0.01 # final OneCycleLR learning rate (lr0 * lrf)
+ momentum: 0.937 # SGD momentum/Adam beta1
+ weight_decay: 0.0005 # optimizer weight decay 5e-4
+ warmup_epochs: 3.0 # warmup epochs (fractions ok)
+ warmup_momentum: 0.8 # warmup initial momentum
+ warmup_bias_lr: 0.1 # warmup initial bias lr
+ box: 7.5 # box loss gain
+ cls: 0.5 # cls loss gain
+ cls_pw: 1.0 # cls BCELoss positive_weight
+ obj: 0.7 # obj loss gain (scale with pixels)
+ obj_pw: 1.0 # obj BCELoss positive_weight
+ dfl: 1.5 # dfl loss gain
+ iou_t: 0.20 # IoU training threshold
+ anchor_t: 5.0 # anchor-multiple threshold
+ # anchors: 3 # anchors per output layer (0 to ignore)
+ fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
+ hsv_h: 0.015 # image HSV-Hue augmentation (fraction)
+ hsv_s: 0.7 # image HSV-Saturation augmentation (fraction)
+ hsv_v: 0.4 # image HSV-Value augmentation (fraction)
+ degrees: 0.0 # image rotation (+/- deg)
+ translate: 0.1 # image translation (+/- fraction)
+ scale: 0.9 # image scale (+/- gain)
+ shear: 0.0 # image shear (+/- deg)
+ perspective: 0.0 # image perspective (+/- fraction), range 0-0.001
+ flipud: 0.0 # image flip up-down (probability)
+ fliplr: 0.5 # image flip left-right (probability)
+ mosaic: 1.0 # image mosaic (probability)
+ mixup: 0.15 # image mixup (probability)
+ copy_paste: 0.3 # segment copy-paste (probability)
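
Note (not part of this commit): a hyperparameter file like this is consumed as a flat YAML dictionary by the training script. A minimal loading sketch, assuming plain PyYAML rather than the repository's own yaml_load helper:

import yaml

with open('data/hyps/hyp.scratch-high.yaml', errors='ignore') as f:
    hyp = yaml.safe_load(f)              # flat dict of scalar hyperparameters
print(hyp['lr0'], hyp['lrf'])            # 0.01 0.01 -> final OneCycleLR learning rate = lr0 * lrf = 1e-4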
data/images/horses.jpg ADDED

Git LFS Details

  • SHA256: c8f0a677a1356569e2ce71d2fa88c1030c0ae57ecf5e14170e02d9a86a20dcb4
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
data/multiplane.yaml ADDED
@@ -0,0 +1,9 @@
+ train: ./axial_t1wce_2_class/images/train
+ val: ./axial_t1wce_2_class/images/test
+ # train: ./coronal_t1wce_2_class/images/train
+ # val: ./coronal_t1wce_2_class/images/test
+ # train: ./sagittal_t1wce_2_class/images/train
+ # val: ./sagittal_t1wce_2_class/images/test
+
+ nc: 2
+ names: ['negative','positive']
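
Note (not part of this commit): this dataset YAML points at the axial plane by default; the coronal and sagittal pairs are commented out and can be swapped in. A minimal sanity-check sketch, again assuming plain PyYAML:

import yaml

with open('data/multiplane.yaml', errors='ignore') as f:
    data = yaml.safe_load(f)
assert data['nc'] == len(data['names'])   # 2 classes: negative, positive
print(data['train'], data['val'])         # ./axial_t1wce_2_class/images/train ./axial_t1wce_2_class/images/test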
models/__init__.py ADDED
@@ -0,0 +1 @@
+ # init
models/common.py ADDED
@@ -0,0 +1,1296 @@
1
+ import ast
2
+ import contextlib
3
+ import json
4
+ import math
5
+ import platform
6
+ import warnings
7
+ import zipfile
8
+ from collections import OrderedDict, namedtuple
9
+ from copy import copy
10
+ from pathlib import Path
11
+ from urllib.parse import urlparse
12
+
13
+ from typing import Optional
14
+
15
+ import cv2
16
+ import numpy as np
17
+ import pandas as pd
18
+ import requests
19
+ import torch
20
+ import torch.nn as nn
21
+ from IPython.display import display
22
+ from PIL import Image
23
+ from torch.cuda import amp
24
+
25
+ from models.repvit import RepViT
26
+ from utils import TryExcept
27
+ from utils.dataloaders import exif_transpose, letterbox
28
+ from utils.general import (LOGGER, ROOT, Profile, check_requirements, check_suffix, check_version, colorstr,
29
+ increment_path, is_notebook, make_divisible, non_max_suppression, scale_boxes,
30
+ xywh2xyxy, xyxy2xywh, yaml_load)
31
+ from utils.plots import Annotator, colors, save_one_box
32
+ from utils.torch_utils import copy_attr, smart_inference_mode
33
+
34
+
35
+ def autopad(k, p=None, d=1): # kernel, padding, dilation
36
+ # Pad to 'same' shape outputs
37
+ if d > 1:
38
+ k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
39
+ if p is None:
40
+ p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
41
+ return p
42
+
43
+
44
+ class Conv(nn.Module):
45
+ # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)
46
+ default_act = nn.SiLU() # default activation
47
+
48
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
49
+ super().__init__()
50
+ self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
51
+ self.bn = nn.BatchNorm2d(c2)
52
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
53
+
54
+ def forward(self, x):
55
+ return self.act(self.bn(self.conv(x)))
56
+
57
+ def forward_fuse(self, x):
58
+ return self.act(self.conv(x))
59
+
60
+
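Note (not part of this commit): autopad returns k // 2 for the (dilation-adjusted) kernel, so a stride-1 Conv keeps the spatial size. A small sketch of the intended behaviour, assuming this file imports as models.common:

import torch
from models.common import Conv, autopad

assert autopad(3) == 1 and autopad(5) == 2   # 'same' padding for odd kernels
assert autopad(3, d=2) == 2                  # dilation 2 inflates a 3x3 kernel to an effective 5x5
m = Conv(16, 32, k=3, s=1)
x = torch.randn(1, 16, 64, 64)
assert m(x).shape == (1, 32, 64, 64)         # stride 1 preserves H and W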
61
+ class AConv(nn.Module):
62
+ def __init__(self, c1, c2): # ch_in, ch_out, shortcut, kernels, groups, expand
63
+ super().__init__()
64
+ self.cv1 = Conv(c1, c2, 3, 2, 1)
65
+
66
+ def forward(self, x):
67
+ x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
68
+ return self.cv1(x)
69
+
70
+
71
+ class ADown(nn.Module):
72
+ def __init__(self, c1, c2): # ch_in, ch_out, shortcut, kernels, groups, expand
73
+ super().__init__()
74
+ self.c = c2 // 2
75
+ self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
76
+ self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0)
77
+
78
+ def forward(self, x):
79
+ x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
80
+ x1, x2 = x.chunk(2, 1)
81
+ x1 = self.cv1(x1)
82
+ x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1)
83
+ x2 = self.cv2(x2)
84
+ return torch.cat((x1, x2), 1)
85
+
86
+
87
+ class RepConvN(nn.Module):
88
+ """RepConv is a basic rep-style block, including training and deploy status
89
+ This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
90
+ """
91
+ default_act = nn.SiLU() # default activation
92
+
93
+ def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
94
+ super().__init__()
95
+ assert k == 3 and p == 1
96
+ self.g = g
97
+ self.c1 = c1
98
+ self.c2 = c2
99
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
100
+
101
+ self.bn = None
102
+ self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
103
+ self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)
104
+
105
+ def forward_fuse(self, x):
106
+ """Forward process"""
107
+ return self.act(self.conv(x))
108
+
109
+ def forward(self, x):
110
+ """Forward process"""
111
+ id_out = 0 if self.bn is None else self.bn(x)
112
+ return self.act(self.conv1(x) + self.conv2(x) + id_out)
113
+
114
+ def get_equivalent_kernel_bias(self):
115
+ kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
116
+ kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
117
+ kernelid, biasid = self._fuse_bn_tensor(self.bn)
118
+ return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
119
+
120
+ def _avg_to_3x3_tensor(self, avgp):
121
+ channels = self.c1
122
+ groups = self.g
123
+ kernel_size = avgp.kernel_size
124
+ input_dim = channels // groups
125
+ k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
126
+ k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2
127
+ return k
128
+
129
+ def _pad_1x1_to_3x3_tensor(self, kernel1x1):
130
+ if kernel1x1 is None:
131
+ return 0
132
+ else:
133
+ return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
134
+
135
+ def _fuse_bn_tensor(self, branch):
136
+ if branch is None:
137
+ return 0, 0
138
+ if isinstance(branch, Conv):
139
+ kernel = branch.conv.weight
140
+ running_mean = branch.bn.running_mean
141
+ running_var = branch.bn.running_var
142
+ gamma = branch.bn.weight
143
+ beta = branch.bn.bias
144
+ eps = branch.bn.eps
145
+ elif isinstance(branch, nn.BatchNorm2d):
146
+ if not hasattr(self, 'id_tensor'):
147
+ input_dim = self.c1 // self.g
148
+ kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
149
+ for i in range(self.c1):
150
+ kernel_value[i, i % input_dim, 1, 1] = 1
151
+ self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
152
+ kernel = self.id_tensor
153
+ running_mean = branch.running_mean
154
+ running_var = branch.running_var
155
+ gamma = branch.weight
156
+ beta = branch.bias
157
+ eps = branch.eps
158
+ std = (running_var + eps).sqrt()
159
+ t = (gamma / std).reshape(-1, 1, 1, 1)
160
+ return kernel * t, beta - running_mean * gamma / std
161
+
162
+ def fuse_convs(self):
163
+ if hasattr(self, 'conv'):
164
+ return
165
+ kernel, bias = self.get_equivalent_kernel_bias()
166
+ self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
167
+ out_channels=self.conv1.conv.out_channels,
168
+ kernel_size=self.conv1.conv.kernel_size,
169
+ stride=self.conv1.conv.stride,
170
+ padding=self.conv1.conv.padding,
171
+ dilation=self.conv1.conv.dilation,
172
+ groups=self.conv1.conv.groups,
173
+ bias=True).requires_grad_(False)
174
+ self.conv.weight.data = kernel
175
+ self.conv.bias.data = bias
176
+ for para in self.parameters():
177
+ para.detach_()
178
+ self.__delattr__('conv1')
179
+ self.__delattr__('conv2')
180
+ if hasattr(self, 'nm'):
181
+ self.__delattr__('nm')
182
+ if hasattr(self, 'bn'):
183
+ self.__delattr__('bn')
184
+ if hasattr(self, 'id_tensor'):
185
+ self.__delattr__('id_tensor')
186
+
187
+
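Note (not part of this commit): fuse_convs folds the 3x3 and padded 1x1 branches (plus their BatchNorms) into a single 3x3 convolution, so after fusion the forward pass should match the multi-branch forward in eval mode up to floating-point error. A quick equivalence sketch, assuming models.common is importable:

import torch
from models.common import RepConvN

m = RepConvN(8, 8).eval()          # eval() so BatchNorm uses running statistics
x = torch.randn(1, 8, 32, 32)
y_train = m(x)                     # 3x3 branch + 1x1 branch
m.fuse_convs()                     # re-parameterize into one 3x3 conv
y_fused = m.forward_fuse(x)
assert torch.allclose(y_train, y_fused, atol=1e-5)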
188
+ class SP(nn.Module):
189
+ def __init__(self, k=3, s=1):
190
+ super(SP, self).__init__()
191
+ self.m = nn.MaxPool2d(kernel_size=k, stride=s, padding=k // 2)
192
+
193
+ def forward(self, x):
194
+ return self.m(x)
195
+
196
+
197
+ class MP(nn.Module):
198
+ # Max pooling
199
+ def __init__(self, k=2):
200
+ super(MP, self).__init__()
201
+ self.m = nn.MaxPool2d(kernel_size=k, stride=k)
202
+
203
+ def forward(self, x):
204
+ return self.m(x)
205
+
206
+
207
+ class ConvTranspose(nn.Module):
208
+ # Convolution transpose 2d layer
209
+ default_act = nn.SiLU() # default activation
210
+
211
+ def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
212
+ super().__init__()
213
+ self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
214
+ self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity()
215
+ self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
216
+
217
+ def forward(self, x):
218
+ return self.act(self.bn(self.conv_transpose(x)))
219
+
220
+
221
+ class DWConv(Conv):
222
+ # Depth-wise convolution
223
+ def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation
224
+ super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
225
+
226
+
227
+ class DWConvTranspose2d(nn.ConvTranspose2d):
228
+ # Depth-wise transpose convolution
229
+ def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out
230
+ super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
231
+
232
+
233
+ class DFL(nn.Module):
234
+ # DFL module
235
+ def __init__(self, c1=17):
236
+ super().__init__()
237
+ self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
238
+ self.conv.weight.data[:] = nn.Parameter(torch.arange(c1, dtype=torch.float).view(1, c1, 1, 1)) # / 120.0
239
+ self.c1 = c1
240
+ # self.bn = nn.BatchNorm2d(4)
241
+
242
+ def forward(self, x):
243
+ b, c, a = x.shape # batch, channels, anchors
244
+ return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
245
+ # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
246
+
247
+
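Note (not part of this commit): with its conv weight frozen to arange(c1), DFL reduces each of the four per-side bin distributions to its expected bin index (the integral form used by distribution focal loss). A sketch verifying that, assuming models.common is importable:

import torch
from models.common import DFL

c1, anchors = 16, 100
m = DFL(c1)
x = torch.randn(2, 4 * c1, anchors)                          # raw logits: 4 box sides x c1 bins per anchor
y = m(x)                                                     # shape (2, 4, anchors)
probs = x.view(2, 4, c1, anchors).transpose(2, 1).softmax(1)
expected = (probs * torch.arange(c1, dtype=torch.float).view(1, c1, 1, 1)).sum(1)
assert torch.allclose(y, expected, atol=1e-5)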
248
+ class BottleneckBase(nn.Module):
249
+ # Standard bottleneck
250
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(1, 3), e=0.5): # ch_in, ch_out, shortcut, kernels, groups, expand
251
+ super().__init__()
252
+ c_ = int(c2 * e) # hidden channels
253
+ self.cv1 = Conv(c1, c_, k[0], 1)
254
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
255
+ self.add = shortcut and c1 == c2
256
+
257
+ def forward(self, x):
258
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
259
+
260
+
261
+ class RBottleneckBase(nn.Module):
262
+ # Standard bottleneck
263
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 1), e=0.5): # ch_in, ch_out, shortcut, kernels, groups, expand
264
+ super().__init__()
265
+ c_ = int(c2 * e) # hidden channels
266
+ self.cv1 = Conv(c1, c_, k[0], 1)
267
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
268
+ self.add = shortcut and c1 == c2
269
+
270
+ def forward(self, x):
271
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
272
+
273
+
274
+ class RepNRBottleneckBase(nn.Module):
275
+ # Standard bottleneck
276
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 1), e=0.5): # ch_in, ch_out, shortcut, kernels, groups, expand
277
+ super().__init__()
278
+ c_ = int(c2 * e) # hidden channels
279
+ self.cv1 = RepConvN(c1, c_, k[0], 1)
280
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
281
+ self.add = shortcut and c1 == c2
282
+
283
+ def forward(self, x):
284
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
285
+
286
+
287
+ class Bottleneck(nn.Module):
288
+ # Standard bottleneck
289
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, kernels, groups, expand
290
+ super().__init__()
291
+ c_ = int(c2 * e) # hidden channels
292
+ self.cv1 = Conv(c1, c_, k[0], 1)
293
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
294
+ self.add = shortcut and c1 == c2
295
+
296
+ def forward(self, x):
297
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
298
+
299
+
300
+ class RepNBottleneck(nn.Module):
301
+ # Standard bottleneck
302
+ def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, kernels, groups, expand
303
+ super().__init__()
304
+ c_ = int(c2 * e) # hidden channels
305
+ self.cv1 = RepConvN(c1, c_, k[0], 1)
306
+ self.cv2 = Conv(c_, c2, k[1], 1, g=g)
307
+ self.add = shortcut and c1 == c2
308
+
309
+ def forward(self, x):
310
+ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
311
+
312
+ class Backbone(nn.Module):
313
+ def __init__(self):
314
+ super(Backbone, self).__init__()
315
+ self.cfgs = [
316
+ # k, t, c, SE, HS, s
317
+ [3, 2, 64 * 2, 1, 0, 1],
318
+ [3, 2, 64 * 2, 0, 0, 1],
319
+ [3, 2, 64 * 2, 1, 0, 1],
320
+ [3, 2, 64 * 2, 0, 0, 1],
321
+ [3, 2, 64 * 2, 0, 0, 1],
322
+ [3, 2, 128 * 2, 0, 0, 2],
323
+ [3, 2, 128 * 2, 1, 0, 1],
324
+ [3, 2, 128 * 2, 0, 0, 1],
325
+ [3, 2, 128 * 2, 1, 0, 1],
326
+ [3, 2, 128 * 2, 0, 0, 1],
327
+ [3, 2, 128 * 2, 0, 0, 1],
328
+ [3, 2, 256 * 2, 0, 1, 2],
329
+ [3, 2, 256 * 2, 1, 1, 1],
330
+ [3, 2, 256 * 2, 0, 1, 1],
331
+ [3, 2, 256 * 2, 1, 1, 1],
332
+ [3, 2, 256 * 2, 0, 1, 1],
333
+ [3, 2, 256 * 2, 1, 1, 1],
334
+ [3, 2, 256 * 2, 0, 1, 1],
335
+ [3, 2, 256 * 2, 1, 1, 1],
336
+ [3, 2, 256 * 2, 0, 1, 1],
337
+ [3, 2, 256 * 2, 1, 1, 1],
338
+ [3, 2, 256 * 2, 0, 1, 1],
339
+ [3, 2, 256 * 2, 1, 1, 1],
340
+ [3, 2, 256 * 2, 0, 1, 1],
341
+ [3, 2, 256 * 2, 1, 1, 1],
342
+ [3, 2, 256 * 2, 0, 1, 1],
343
+ [3, 2, 256 * 2, 1, 1, 1],
344
+ [3, 2, 256 * 2, 0, 1, 1],
345
+ [3, 2, 256 * 2, 1, 1, 1],
346
+ [3, 2, 256 * 2, 0, 1, 1],
347
+ [3, 2, 256 * 2, 1, 1, 1],
348
+ [3, 2, 256 * 2, 0, 1, 1],
349
+ [3, 2, 256 * 2, 1, 1, 1],
350
+ [3, 2, 256 * 2, 0, 1, 1],
351
+ [3, 2, 256 * 2, 1, 1, 1],
352
+ [3, 2, 256 * 2, 0, 1, 1],
353
+ [3, 2, 256 * 2, 0, 1, 1],
354
+ [3, 2, 512 * 2, 0, 1, 2],
355
+ [3, 2, 512 * 2, 1, 1, 1],
356
+ [3, 2, 512 * 2, 0, 1, 1],
357
+ [3, 2, 512 * 2, 1, 1, 1],
358
+ [3, 2, 512 * 2, 0, 1, 1]
359
+ ]
360
+ self.backbone = RepViT(self.cfgs)
361
+
362
+ def forward(self, x):
363
+ outputs = self.backbone(x)
364
+ return outputs
365
+ class Down0(nn.Module):
366
+ def __init__(self,inp):
367
+ super(Down0, self).__init__()
368
+
369
+ def forward(self, x):
370
+ return x[0]
371
+ class Down1(nn.Module):
372
+ def __init__(self,inp):
373
+ super(Down1, self).__init__()
374
+
375
+ def forward(self, x):
376
+ return x[1]
377
+ class Down2(nn.Module):
378
+ def __init__(self,inp):
379
+ super(Down2, self).__init__()
380
+
381
+ def forward(self, x):
382
+ return x[2]
383
+ class Down3(nn.Module):
384
+ def __init__(self,inp):
385
+ super(Down3, self).__init__()
386
+
387
+ def forward(self, x):
388
+ return x[3]
389
+
390
+ class Down4(nn.Module):
391
+ def __init__(self,inp):
392
+ super(Down4, self).__init__()
393
+
394
+ def forward(self, x):
395
+ return x[4]
396
+ class Res(nn.Module):
397
+ # ResNet bottleneck
398
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
399
+ super(Res, self).__init__()
400
+ c_ = int(c2 * e) # hidden channels
401
+ self.cv1 = Conv(c1, c_, 1, 1)
402
+ self.cv2 = Conv(c_, c_, 3, 1, g=g)
403
+ self.cv3 = Conv(c_, c2, 1, 1)
404
+ self.add = shortcut and c1 == c2
405
+
406
+ def forward(self, x):
407
+ return x + self.cv3(self.cv2(self.cv1(x))) if self.add else self.cv3(self.cv2(self.cv1(x)))
408
+
409
+
410
+ class RepNRes(nn.Module):
411
+ # ResNet bottleneck
412
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
413
+ super(RepNRes, self).__init__()
414
+ c_ = int(c2 * e) # hidden channels
415
+ self.cv1 = Conv(c1, c_, 1, 1)
416
+ self.cv2 = RepConvN(c_, c_, 3, 1, g=g)
417
+ self.cv3 = Conv(c_, c2, 1, 1)
418
+ self.add = shortcut and c1 == c2
419
+
420
+ def forward(self, x):
421
+ return x + self.cv3(self.cv2(self.cv1(x))) if self.add else self.cv3(self.cv2(self.cv1(x)))
422
+
423
+
424
+ class BottleneckCSP(nn.Module):
425
+ # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
426
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
427
+ super().__init__()
428
+ c_ = int(c2 * e) # hidden channels
429
+ self.cv1 = Conv(c1, c_, 1, 1)
430
+ self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
431
+ self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
432
+ self.cv4 = Conv(2 * c_, c2, 1, 1)
433
+ self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
434
+ self.act = nn.SiLU()
435
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
436
+
437
+ def forward(self, x):
438
+ y1 = self.cv3(self.m(self.cv1(x)))
439
+ y2 = self.cv2(x)
440
+ return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
441
+
442
+
443
+ class CSP(nn.Module):
444
+ # CSP Bottleneck with 3 convolutions
445
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
446
+ super().__init__()
447
+ c_ = int(c2 * e) # hidden channels
448
+ self.cv1 = Conv(c1, c_, 1, 1)
449
+ self.cv2 = Conv(c1, c_, 1, 1)
450
+ self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
451
+ self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
452
+
453
+ def forward(self, x):
454
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
455
+
456
+
457
+ class RepNCSP(nn.Module):
458
+ # CSP Bottleneck with 3 convolutions
459
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
460
+ super().__init__()
461
+ c_ = int(c2 * e) # hidden channels
462
+ self.cv1 = Conv(c1, c_, 1, 1)
463
+ self.cv2 = Conv(c1, c_, 1, 1)
464
+ self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
465
+ self.m = nn.Sequential(*(RepNBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
466
+
467
+ def forward(self, x):
468
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
469
+
470
+
471
+ class CSPBase(nn.Module):
472
+ # CSP Bottleneck with 3 convolutions
473
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
474
+ super().__init__()
475
+ c_ = int(c2 * e) # hidden channels
476
+ self.cv1 = Conv(c1, c_, 1, 1)
477
+ self.cv2 = Conv(c1, c_, 1, 1)
478
+ self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
479
+ self.m = nn.Sequential(*(BottleneckBase(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
480
+
481
+ def forward(self, x):
482
+ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
483
+
484
+
485
+ class SPP(nn.Module):
486
+ # Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729
487
+ def __init__(self, c1, c2, k=(5, 9, 13)):
488
+ super().__init__()
489
+ c_ = c1 // 2 # hidden channels
490
+ self.cv1 = Conv(c1, c_, 1, 1)
491
+ self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
492
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
493
+
494
+ def forward(self, x):
495
+ x = self.cv1(x)
496
+ with warnings.catch_warnings():
497
+ warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning
498
+ return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
499
+
500
+
501
+ class ASPP(torch.nn.Module):
502
+
503
+ def __init__(self, in_channels, out_channels):
504
+ super().__init__()
505
+ kernel_sizes = [1, 3, 3, 1]
506
+ dilations = [1, 3, 6, 1]
507
+ paddings = [0, 3, 6, 0]
508
+ self.aspp = torch.nn.ModuleList()
509
+ for aspp_idx in range(len(kernel_sizes)):
510
+ conv = torch.nn.Conv2d(
511
+ in_channels,
512
+ out_channels,
513
+ kernel_size=kernel_sizes[aspp_idx],
514
+ stride=1,
515
+ dilation=dilations[aspp_idx],
516
+ padding=paddings[aspp_idx],
517
+ bias=True)
518
+ self.aspp.append(conv)
519
+ self.gap = torch.nn.AdaptiveAvgPool2d(1)
520
+ self.aspp_num = len(kernel_sizes)
521
+ for m in self.modules():
522
+ if isinstance(m, torch.nn.Conv2d):
523
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
524
+ m.weight.data.normal_(0, math.sqrt(2. / n))
525
+ m.bias.data.fill_(0)
526
+
527
+ def forward(self, x):
528
+ avg_x = self.gap(x)
529
+ out = []
530
+ for aspp_idx in range(self.aspp_num):
531
+ inp = avg_x if (aspp_idx == self.aspp_num - 1) else x
532
+ out.append(F.relu_(self.aspp[aspp_idx](inp)))
533
+ out[-1] = out[-1].expand_as(out[-2])
534
+ out = torch.cat(out, dim=1)
535
+ return out
536
+
537
+
538
+ class SPPCSPC(nn.Module):
539
+ # CSP SPP https://github.com/WongKinYiu/CrossStagePartialNetworks
540
+ def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(5, 9, 13)):
541
+ super(SPPCSPC, self).__init__()
542
+ c_ = int(2 * c2 * e) # hidden channels
543
+ self.cv1 = Conv(c1, c_, 1, 1)
544
+ self.cv2 = Conv(c1, c_, 1, 1)
545
+ self.cv3 = Conv(c_, c_, 3, 1)
546
+ self.cv4 = Conv(c_, c_, 1, 1)
547
+ self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
548
+ self.cv5 = Conv(4 * c_, c_, 1, 1)
549
+ self.cv6 = Conv(c_, c_, 3, 1)
550
+ self.cv7 = Conv(2 * c_, c2, 1, 1)
551
+
552
+ def forward(self, x):
553
+ x1 = self.cv4(self.cv3(self.cv1(x)))
554
+ y1 = self.cv6(self.cv5(torch.cat([x1] + [m(x1) for m in self.m], 1)))
555
+ y2 = self.cv2(x)
556
+ return self.cv7(torch.cat((y1, y2), dim=1))
557
+
558
+
559
+ class SPPF(nn.Module):
560
+ # Spatial Pyramid Pooling - Fast (SPPF) layer by Glenn Jocher
561
+ def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13))
562
+ super().__init__()
563
+ c_ = c1 // 2 # hidden channels
564
+ self.cv1 = Conv(c1, c_, 1, 1)
565
+ self.cv2 = Conv(c_ * 4, c2, 1, 1)
566
+ self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
567
+ # self.m = SoftPool2d(kernel_size=k, stride=1, padding=k // 2)
568
+
569
+ def forward(self, x):
570
+ x = self.cv1(x)
571
+ with warnings.catch_warnings():
572
+ warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning
573
+ y1 = self.m(x)
574
+ y2 = self.m(y1)
575
+ return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
576
+
577
+
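Note (not part of this commit): the comment above calls SPPF(k=5) equivalent to SPP(k=(5, 9, 13)); the reason is that stacked stride-1 5x5 max-pools reproduce the 9x9 and 13x13 receptive fields exactly. A standalone check using plain nn.MaxPool2d:

import torch
import torch.nn as nn

x = torch.randn(1, 3, 40, 40)
p5 = nn.MaxPool2d(5, stride=1, padding=2)
p9 = nn.MaxPool2d(9, stride=1, padding=4)
p13 = nn.MaxPool2d(13, stride=1, padding=6)
assert torch.equal(p5(p5(x)), p9(x))         # two 5x5 pools == one 9x9 pool
assert torch.equal(p5(p5(p5(x))), p13(x))    # three 5x5 pools == one 13x13 pool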
578
+ import torch.nn.functional as F
579
+ from torch.nn.modules.utils import _pair
580
+
581
+
582
+ class ReOrg(nn.Module):
583
+ # yolo
584
+ def __init__(self):
585
+ super(ReOrg, self).__init__()
586
+
587
+ def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
588
+ return torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1)
589
+
590
+
591
+ class Contract(nn.Module):
592
+ # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
593
+ def __init__(self, gain=2):
594
+ super().__init__()
595
+ self.gain = gain
596
+
597
+ def forward(self, x):
598
+ b, c, h, w = x.size() # assert (h / s == 0) and (W / s == 0), 'Indivisible gain'
599
+ s = self.gain
600
+ x = x.view(b, c, h // s, s, w // s, s) # x(1,64,40,2,40,2)
601
+ x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # x(1,2,2,64,40,40)
602
+ return x.view(b, c * s * s, h // s, w // s) # x(1,256,40,40)
603
+
604
+
605
+ class Expand(nn.Module):
606
+ # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)
607
+ def __init__(self, gain=2):
608
+ super().__init__()
609
+ self.gain = gain
610
+
611
+ def forward(self, x):
612
+ b, c, h, w = x.size() # assert C / s ** 2 == 0, 'Indivisible gain'
613
+ s = self.gain
614
+ x = x.view(b, s, s, c // s ** 2, h, w) # x(1,2,2,16,80,80)
615
+ x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # x(1,16,80,2,80,2)
616
+ return x.view(b, c // s ** 2, h * s, w * s) # x(1,16,160,160)
617
+
618
+
619
+ class Concat(nn.Module):
620
+ # Concatenate a list of tensors along dimension
621
+ def __init__(self, dimension=1):
622
+ super().__init__()
623
+ self.d = dimension
624
+
625
+ def forward(self, x):
626
+ return torch.cat(x, self.d)
627
+
628
+
629
+ class Shortcut(nn.Module):
630
+ def __init__(self, dimension=0):
631
+ super(Shortcut, self).__init__()
632
+ self.d = dimension
633
+
634
+ def forward(self, x):
635
+ return x[0]+x[1]
636
+
637
+
638
+ class Silence(nn.Module):
639
+ def __init__(self):
640
+ super(Silence, self).__init__()
641
+ def forward(self, x):
642
+ return x
643
+
644
+
645
+ ##### GELAN #####
646
+
647
+ class SPPELAN(nn.Module):
648
+ # spp-elan
649
+ def __init__(self, c1, c2, c3): # ch_in, ch_out, number, shortcut, groups, expansion
650
+ super().__init__()
651
+ self.c = c3
652
+ self.cv1 = Conv(c1, c3, 1, 1)
653
+ self.cv2 = SP(5)
654
+ self.cv3 = SP(5)
655
+ self.cv4 = SP(5)
656
+ self.cv5 = Conv(4*c3, c2, 1, 1)
657
+
658
+ def forward(self, x):
659
+ y = [self.cv1(x)]
660
+ y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4])
661
+ return self.cv5(torch.cat(y, 1))
662
+
663
+
664
+ class RepNCSPELAN4(nn.Module):
665
+ # csp-elan
666
+ def __init__(self, c1, c2, c3, c4, c5=1): # ch_in, ch_out, number, shortcut, groups, expansion
667
+ super().__init__()
668
+ self.c = c3//2
669
+ self.cv1 = Conv(c1, c3, 1, 1)
670
+ self.cv2 = nn.Sequential(RepNCSP(c3//2, c4, c5), Conv(c4, c4, 3, 1))
671
+ self.cv3 = nn.Sequential(RepNCSP(c4, c4, c5), Conv(c4, c4, 3, 1))
672
+ self.cv4 = Conv(c3+(2*c4), c2, 1, 1)
673
+
674
+ def forward(self, x):
675
+ y = list(self.cv1(x).chunk(2, 1))
676
+ y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
677
+ return self.cv4(torch.cat(y, 1))
678
+
679
+ def forward_split(self, x):
680
+ y = list(self.cv1(x).split((self.c, self.c), 1))
681
+ y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
682
+ return self.cv4(torch.cat(y, 1))
683
+
684
+ #################
685
+
686
+
687
+ ##### YOLOR #####
688
+
689
+ class ImplicitA(nn.Module):
690
+ def __init__(self, channel):
691
+ super(ImplicitA, self).__init__()
692
+ self.channel = channel
693
+ self.implicit = nn.Parameter(torch.zeros(1, channel, 1, 1))
694
+ nn.init.normal_(self.implicit, std=.02)
695
+
696
+ def forward(self, x):
697
+ return self.implicit + x
698
+
699
+
700
+ class ImplicitM(nn.Module):
701
+ def __init__(self, channel):
702
+ super(ImplicitM, self).__init__()
703
+ self.channel = channel
704
+ self.implicit = nn.Parameter(torch.ones(1, channel, 1, 1))
705
+ nn.init.normal_(self.implicit, mean=1., std=.02)
706
+
707
+ def forward(self, x):
708
+ return self.implicit * x
709
+
710
+ #################
711
+
712
+
713
+ ##### CBNet #####
714
+
715
+ class CBLinear(nn.Module):
716
+ def __init__(self, c1, c2s, k=1, s=1, p=None, g=1): # ch_in, ch_outs, kernel, stride, padding, groups
717
+ super(CBLinear, self).__init__()
718
+ self.c2s = c2s
719
+ self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)
720
+
721
+ def forward(self, x):
722
+ outs = self.conv(x).split(self.c2s, dim=1)
723
+ return outs
724
+
725
+ class CBFuse(nn.Module):
726
+ def __init__(self, idx):
727
+ super(CBFuse, self).__init__()
728
+ self.idx = idx
729
+
730
+ def forward(self, xs):
731
+ target_size = xs[-1].shape[2:]
732
+ res = [F.interpolate(x[self.idx[i]], size=target_size, mode='nearest') for i, x in enumerate(xs[:-1])]
733
+ out = torch.sum(torch.stack(res + xs[-1:]), dim=0)
734
+ return out
735
+
736
+ #################
737
+
738
+
739
+ class DetectMultiBackend(nn.Module):
740
+ # YOLO MultiBackend class for python inference on various backends
741
+ def __init__(self, weights='yolo.pt', device=torch.device('cpu'), dnn=False, data=None, fp16=False, fuse=True):
742
+ # Usage:
743
+ # PyTorch: weights = *.pt
744
+ # TorchScript: *.torchscript
745
+ # ONNX Runtime: *.onnx
746
+ # ONNX OpenCV DNN: *.onnx --dnn
747
+ # OpenVINO: *_openvino_model
748
+ # CoreML: *.mlmodel
749
+ # TensorRT: *.engine
750
+ # TensorFlow SavedModel: *_saved_model
751
+ # TensorFlow GraphDef: *.pb
752
+ # TensorFlow Lite: *.tflite
753
+ # TensorFlow Edge TPU: *_edgetpu.tflite
754
+ # PaddlePaddle: *_paddle_model
755
+ from models.experimental import attempt_download, attempt_load # scoped to avoid circular import
756
+
757
+ super().__init__()
758
+ w = str(weights[0] if isinstance(weights, list) else weights)
759
+ pt, jit, onnx, onnx_end2end, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, triton = self._model_type(w)
760
+ fp16 &= pt or jit or onnx or engine # FP16
761
+ nhwc = coreml or saved_model or pb or tflite or edgetpu # BHWC formats (vs torch BCWH)
762
+ stride = 32 # default stride
763
+ cuda = torch.cuda.is_available() and device.type != 'cpu' # use CUDA
764
+ if not (pt or triton):
765
+ w = attempt_download(w) # download if not local
766
+
767
+ if pt: # PyTorch
768
+ model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
769
+ stride = max(int(model.stride.max()), 32) # model stride
770
+ names = model.module.names if hasattr(model, 'module') else model.names # get class names
771
+ model.half() if fp16 else model.float()
772
+ self.model = model # explicitly assign for to(), cpu(), cuda(), half()
773
+ elif jit: # TorchScript
774
+ LOGGER.info(f'Loading {w} for TorchScript inference...')
775
+ extra_files = {'config.txt': ''} # model metadata
776
+ model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
777
+ model.half() if fp16 else model.float()
778
+ if extra_files['config.txt']: # load metadata dict
779
+ d = json.loads(extra_files['config.txt'],
780
+ object_hook=lambda d: {int(k) if k.isdigit() else k: v
781
+ for k, v in d.items()})
782
+ stride, names = int(d['stride']), d['names']
783
+ elif dnn: # ONNX OpenCV DNN
784
+ LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...')
785
+ check_requirements('opencv-python>=4.5.4')
786
+ net = cv2.dnn.readNetFromONNX(w)
787
+ elif onnx: # ONNX Runtime
788
+ LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
789
+ check_requirements(('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
790
+ import onnxruntime
791
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
792
+ session = onnxruntime.InferenceSession(w, providers=providers)
793
+ output_names = [x.name for x in session.get_outputs()]
794
+ meta = session.get_modelmeta().custom_metadata_map # metadata
795
+ if 'stride' in meta:
796
+ stride, names = int(meta['stride']), eval(meta['names'])
797
+ elif xml: # OpenVINO
798
+ LOGGER.info(f'Loading {w} for OpenVINO inference...')
799
+ check_requirements('openvino') # requires openvino-dev: https://pypi.org/project/openvino-dev/
800
+ from openvino.runtime import Core, Layout, get_batch
801
+ ie = Core()
802
+ if not Path(w).is_file(): # if not *.xml
803
+ w = next(Path(w).glob('*.xml')) # get *.xml file from *_openvino_model dir
804
+ network = ie.read_model(model=w, weights=Path(w).with_suffix('.bin'))
805
+ if network.get_parameters()[0].get_layout().empty:
806
+ network.get_parameters()[0].set_layout(Layout("NCHW"))
807
+ batch_dim = get_batch(network)
808
+ if batch_dim.is_static:
809
+ batch_size = batch_dim.get_length()
810
+ executable_network = ie.compile_model(network, device_name="CPU") # device_name="MYRIAD" for Intel NCS2
811
+ stride, names = self._load_metadata(Path(w).with_suffix('.yaml')) # load metadata
812
+ elif engine: # TensorRT
813
+ LOGGER.info(f'Loading {w} for TensorRT inference...')
814
+ import tensorrt as trt # https://developer.nvidia.com/nvidia-tensorrt-download
815
+ check_version(trt.__version__, '7.0.0', hard=True) # require tensorrt>=7.0.0
816
+ if device.type == 'cpu':
817
+ device = torch.device('cuda:0')
818
+ Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
819
+ logger = trt.Logger(trt.Logger.INFO)
820
+ with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
821
+ model = runtime.deserialize_cuda_engine(f.read())
822
+ context = model.create_execution_context()
823
+ bindings = OrderedDict()
824
+ output_names = []
825
+ fp16 = False # default updated below
826
+ dynamic = False
827
+ for i in range(model.num_bindings):
828
+ name = model.get_binding_name(i)
829
+ dtype = trt.nptype(model.get_binding_dtype(i))
830
+ if model.binding_is_input(i):
831
+ if -1 in tuple(model.get_binding_shape(i)): # dynamic
832
+ dynamic = True
833
+ context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
834
+ if dtype == np.float16:
835
+ fp16 = True
836
+ else: # output
837
+ output_names.append(name)
838
+ shape = tuple(context.get_binding_shape(i))
839
+ im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
840
+ bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
841
+ binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
842
+ batch_size = bindings['images'].shape[0] # if dynamic, this is instead max batch size
843
+ elif coreml: # CoreML
844
+ LOGGER.info(f'Loading {w} for CoreML inference...')
845
+ import coremltools as ct
846
+ model = ct.models.MLModel(w)
847
+ elif saved_model: # TF SavedModel
848
+ LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...')
849
+ import tensorflow as tf
850
+ keras = False # assume TF1 saved_model
851
+ model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
852
+ elif pb: # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
853
+ LOGGER.info(f'Loading {w} for TensorFlow GraphDef inference...')
854
+ import tensorflow as tf
855
+
856
+ def wrap_frozen_graph(gd, inputs, outputs):
857
+ x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=""), []) # wrapped
858
+ ge = x.graph.as_graph_element
859
+ return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))
860
+
861
+ def gd_outputs(gd):
862
+ name_list, input_list = [], []
863
+ for node in gd.node: # tensorflow.core.framework.node_def_pb2.NodeDef
864
+ name_list.append(node.name)
865
+ input_list.extend(node.input)
866
+ return sorted(f'{x}:0' for x in list(set(name_list) - set(input_list)) if not x.startswith('NoOp'))
867
+
868
+ gd = tf.Graph().as_graph_def() # TF GraphDef
869
+ with open(w, 'rb') as f:
870
+ gd.ParseFromString(f.read())
871
+ frozen_func = wrap_frozen_graph(gd, inputs="x:0", outputs=gd_outputs(gd))
872
+ elif tflite or edgetpu: # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
873
+ try: # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
874
+ from tflite_runtime.interpreter import Interpreter, load_delegate
875
+ except ImportError:
876
+ import tensorflow as tf
877
+ Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate,
878
+ if edgetpu: # TF Edge TPU https://coral.ai/software/#edgetpu-runtime
879
+ LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...')
880
+ delegate = {
881
+ 'Linux': 'libedgetpu.so.1',
882
+ 'Darwin': 'libedgetpu.1.dylib',
883
+ 'Windows': 'edgetpu.dll'}[platform.system()]
884
+ interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])
885
+ else: # TFLite
886
+ LOGGER.info(f'Loading {w} for TensorFlow Lite inference...')
887
+ interpreter = Interpreter(model_path=w) # load TFLite model
888
+ interpreter.allocate_tensors() # allocate
889
+ input_details = interpreter.get_input_details() # inputs
890
+ output_details = interpreter.get_output_details() # outputs
891
+ # load metadata
892
+ with contextlib.suppress(zipfile.BadZipFile):
893
+ with zipfile.ZipFile(w, "r") as model:
894
+ meta_file = model.namelist()[0]
895
+ meta = ast.literal_eval(model.read(meta_file).decode("utf-8"))
896
+ stride, names = int(meta['stride']), meta['names']
897
+ elif tfjs: # TF.js
898
+ raise NotImplementedError('ERROR: YOLO TF.js inference is not supported')
899
+ elif paddle: # PaddlePaddle
900
+ LOGGER.info(f'Loading {w} for PaddlePaddle inference...')
901
+ check_requirements('paddlepaddle-gpu' if cuda else 'paddlepaddle')
902
+ import paddle.inference as pdi
903
+ if not Path(w).is_file(): # if not *.pdmodel
904
+ w = next(Path(w).rglob('*.pdmodel')) # get *.pdmodel file from *_paddle_model dir
905
+ weights = Path(w).with_suffix('.pdiparams')
906
+ config = pdi.Config(str(w), str(weights))
907
+ if cuda:
908
+ config.enable_use_gpu(memory_pool_init_size_mb=2048, device_id=0)
909
+ predictor = pdi.create_predictor(config)
910
+ input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
911
+ output_names = predictor.get_output_names()
912
+ elif triton: # NVIDIA Triton Inference Server
913
+ LOGGER.info(f'Using {w} as Triton Inference Server...')
914
+ check_requirements('tritonclient[all]')
915
+ from utils.triton import TritonRemoteModel
916
+ model = TritonRemoteModel(url=w)
917
+ nhwc = model.runtime.startswith("tensorflow")
918
+ else:
919
+ raise NotImplementedError(f'ERROR: {w} is not a supported format')
920
+
921
+ # class names
922
+ if 'names' not in locals():
923
+ names = yaml_load(data)['names'] if data else {i: f'class{i}' for i in range(999)}
924
+ if names[0] == 'n01440764' and len(names) == 1000: # ImageNet
925
+ names = yaml_load(ROOT / 'data/ImageNet.yaml')['names'] # human-readable names
926
+
927
+ self.__dict__.update(locals()) # assign all variables to self
928
+
929
+ def forward(self, im, augment=False, visualize=False):
930
+ # YOLO MultiBackend inference
931
+ b, ch, h, w = im.shape # batch, channel, height, width
932
+ if self.fp16 and im.dtype != torch.float16:
933
+ im = im.half() # to FP16
934
+ if self.nhwc:
935
+ im = im.permute(0, 2, 3, 1) # torch BCHW to numpy BHWC shape(1,320,192,3)
936
+
937
+ if self.pt: # PyTorch
938
+ y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
939
+ elif self.jit: # TorchScript
940
+ y = self.model(im)
941
+ elif self.dnn: # ONNX OpenCV DNN
942
+ im = im.cpu().numpy() # torch to numpy
943
+ self.net.setInput(im)
944
+ y = self.net.forward()
945
+ elif self.onnx: # ONNX Runtime
946
+ im = im.cpu().numpy() # torch to numpy
947
+ y = self.session.run(self.output_names, {self.session.get_inputs()[0].name: im})
948
+ elif self.xml: # OpenVINO
949
+ im = im.cpu().numpy() # FP32
950
+ y = list(self.executable_network([im]).values())
951
+ elif self.engine: # TensorRT
952
+ if self.dynamic and im.shape != self.bindings['images'].shape:
953
+ i = self.model.get_binding_index('images')
954
+ self.context.set_binding_shape(i, im.shape) # reshape if dynamic
955
+ self.bindings['images'] = self.bindings['images']._replace(shape=im.shape)
956
+ for name in self.output_names:
957
+ i = self.model.get_binding_index(name)
958
+ self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
959
+ s = self.bindings['images'].shape
960
+ assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
961
+ self.binding_addrs['images'] = int(im.data_ptr())
962
+ self.context.execute_v2(list(self.binding_addrs.values()))
963
+ y = [self.bindings[x].data for x in sorted(self.output_names)]
964
+ elif self.coreml: # CoreML
965
+ im = im.cpu().numpy()
966
+ im = Image.fromarray((im[0] * 255).astype('uint8'))
967
+ # im = im.resize((192, 320), Image.ANTIALIAS)
968
+ y = self.model.predict({'image': im}) # coordinates are xywh normalized
969
+ if 'confidence' in y:
970
+ box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]]) # xyxy pixels
971
+ conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(float)  # np.float was removed in NumPy 1.24
972
+ y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1)
973
+ else:
974
+ y = list(reversed(y.values())) # reversed for segmentation models (pred, proto)
975
+ elif self.paddle: # PaddlePaddle
976
+ im = im.cpu().numpy().astype(np.float32)
977
+ self.input_handle.copy_from_cpu(im)
978
+ self.predictor.run()
979
+ y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names]
980
+ elif self.triton: # NVIDIA Triton Inference Server
981
+ y = self.model(im)
982
+ else: # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
983
+ im = im.cpu().numpy()
984
+ if self.saved_model: # SavedModel
985
+ y = self.model(im, training=False) if self.keras else self.model(im)
986
+ elif self.pb: # GraphDef
987
+ y = self.frozen_func(x=self.tf.constant(im))
988
+ else: # Lite or Edge TPU
989
+ input = self.input_details[0]
990
+ int8 = input['dtype'] == np.uint8 # is TFLite quantized uint8 model
991
+ if int8:
992
+ scale, zero_point = input['quantization']
993
+ im = (im / scale + zero_point).astype(np.uint8) # de-scale
994
+ self.interpreter.set_tensor(input['index'], im)
995
+ self.interpreter.invoke()
996
+ y = []
997
+ for output in self.output_details:
998
+ x = self.interpreter.get_tensor(output['index'])
999
+ if int8:
1000
+ scale, zero_point = output['quantization']
1001
+ x = (x.astype(np.float32) - zero_point) * scale # re-scale
1002
+ y.append(x)
1003
+ y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
1004
+ y[0][..., :4] *= [w, h, w, h] # xywh normalized to pixels
1005
+
1006
+ if isinstance(y, (list, tuple)):
1007
+ return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y]
1008
+ else:
1009
+ return self.from_numpy(y)
1010
+
1011
+ def from_numpy(self, x):
1012
+ return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x
1013
+
1014
+ def warmup(self, imgsz=(1, 3, 640, 640)):
1015
+ # Warmup model by running inference once
1016
+ warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton
1017
+ if any(warmup_types) and (self.device.type != 'cpu' or self.triton):
1018
+ im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input
1019
+ for _ in range(2 if self.jit else 1): #
1020
+ self.forward(im) # warmup
1021
+
1022
+ @staticmethod
1023
+ def _model_type(p='path/to/model.pt'):
1024
+ # Return model type from model path, i.e. path='path/to/model.onnx' -> type=onnx
1025
+ # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle]
1026
+ from export import export_formats
1027
+ from utils.downloads import is_url
1028
+ sf = list(export_formats().Suffix) # export suffixes
1029
+ if not is_url(p, check=False):
1030
+ check_suffix(p, sf) # checks
1031
+ url = urlparse(p) # if url may be Triton inference server
1032
+ types = [s in Path(p).name for s in sf]
1033
+ types[8] &= not types[9] # tflite &= not edgetpu
1034
+ triton = not any(types) and all([any(s in url.scheme for s in ["http", "grpc"]), url.netloc])
1035
+ return types + [triton]
1036
+
1037
+ @staticmethod
1038
+ def _load_metadata(f=Path('path/to/meta.yaml')):
1039
+ # Load metadata from meta.yaml if it exists
1040
+ if f.exists():
1041
+ d = yaml_load(f)
1042
+ return d['stride'], d['names'] # assign stride, names
1043
+ return None, None
1044
+
1045
+
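Note (not part of this commit): DetectMultiBackend hides the per-format loading logic behind a single forward() call. A minimal usage sketch with a hypothetical checkpoint path ('yolo.pt' is a placeholder) and the dataset file added in this commit:

import torch
from models.common import DetectMultiBackend

model = DetectMultiBackend('yolo.pt', device=torch.device('cpu'), data='data/multiplane.yaml')
model.warmup(imgsz=(1, 3, 640, 640))   # no-op on CPU; runs a dummy forward pass on GPU/Triton
im = torch.zeros(1, 3, 640, 640)       # BCHW float input
pred = model(im)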
1046
+ class AutoShape(nn.Module):
1047
+ # YOLO input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
1048
+ conf = 0.25 # NMS confidence threshold
1049
+ iou = 0.45 # NMS IoU threshold
1050
+ agnostic = False # NMS class-agnostic
1051
+ multi_label = False # NMS multiple labels per box
1052
+ classes = None # (optional list) filter by class, i.e. = [0, 15, 16] for COCO persons, cats and dogs
1053
+ max_det = 1000 # maximum number of detections per image
1054
+ amp = False # Automatic Mixed Precision (AMP) inference
1055
+
1056
+ def __init__(self, model, verbose=True):
1057
+ super().__init__()
1058
+ if verbose:
1059
+ LOGGER.info('Adding AutoShape... ')
1060
+ copy_attr(self, model, include=('yaml', 'nc', 'hyp', 'names', 'stride', 'abc'), exclude=()) # copy attributes
1061
+ self.dmb = isinstance(model, DetectMultiBackend) # DetectMultiBackend() instance
1062
+ self.pt = not self.dmb or model.pt # PyTorch model
1063
+ self.model = model.eval()
1064
+ if self.pt:
1065
+ m = self.model.model.model[-1] if self.dmb else self.model.model[-1] # Detect()
1066
+ m.inplace = False # Detect.inplace=False for safe multithread inference
1067
+ m.export = True # do not output loss values
1068
+
1069
+ def _apply(self, fn):
1070
+ # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
1071
+ self = super()._apply(fn)
1072
+ from models.yolo import Detect, Segment
1073
+ if self.pt:
1074
+ m = self.model.model.model[-1] if self.dmb else self.model.model[-1] # Detect()
1075
+ if isinstance(m, (Detect, Segment)):
1076
+ for k in 'stride', 'anchor_grid', 'stride_grid', 'grid':
1077
+ x = getattr(m, k)
1078
+ setattr(m, k, list(map(fn, x))) if isinstance(x, (list, tuple)) else setattr(m, k, fn(x))
1079
+ return self
1080
+
1081
+ @smart_inference_mode()
1082
+ def forward(self, ims, size=640, augment=False, profile=False):
1083
+ # Inference from various sources. For size(height=640, width=1280), RGB images example inputs are:
1084
+ # file: ims = 'data/images/zidane.jpg' # str or PosixPath
1085
+ # URI: = 'https://ultralytics.com/images/zidane.jpg'
1086
+ # OpenCV: = cv2.imread('image.jpg')[:,:,::-1] # HWC BGR to RGB x(640,1280,3)
1087
+ # PIL: = Image.open('image.jpg') or ImageGrab.grab() # HWC x(640,1280,3)
1088
+ # numpy: = np.zeros((640,1280,3)) # HWC
1089
+ # torch: = torch.zeros(16,3,320,640) # BCHW (scaled to size=640, 0-1 values)
1090
+ # multiple: = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...] # list of images
1091
+
1092
+ dt = (Profile(), Profile(), Profile())
1093
+ with dt[0]:
1094
+ if isinstance(size, int): # expand
1095
+ size = (size, size)
1096
+ p = next(self.model.parameters()) if self.pt else torch.empty(1, device=self.model.device) # param
1097
+ autocast = self.amp and (p.device.type != 'cpu') # Automatic Mixed Precision (AMP) inference
1098
+ if isinstance(ims, torch.Tensor): # torch
1099
+ with amp.autocast(autocast):
1100
+ return self.model(ims.to(p.device).type_as(p), augment=augment) # inference
1101
+
1102
+ # Pre-process
1103
+ n, ims = (len(ims), list(ims)) if isinstance(ims, (list, tuple)) else (1, [ims]) # number, list of images
1104
+ shape0, shape1, files = [], [], [] # image and inference shapes, filenames
1105
+ for i, im in enumerate(ims):
1106
+ f = f'image{i}' # filename
1107
+ if isinstance(im, (str, Path)): # filename or uri
1108
+ im, f = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im), im
1109
+ im = np.asarray(exif_transpose(im))
1110
+ elif isinstance(im, Image.Image): # PIL Image
1111
+ im, f = np.asarray(exif_transpose(im)), getattr(im, 'filename', f) or f
1112
+ files.append(Path(f).with_suffix('.jpg').name)
1113
+ if im.shape[0] < 5: # image in CHW
1114
+ im = im.transpose((1, 2, 0)) # reverse dataloader .transpose(2, 0, 1)
1115
+ im = im[..., :3] if im.ndim == 3 else cv2.cvtColor(im, cv2.COLOR_GRAY2BGR) # enforce 3ch input
1116
+ s = im.shape[:2] # HWC
1117
+ shape0.append(s) # image shape
1118
+ g = max(size) / max(s) # gain
1119
+ shape1.append([int(y * g) for y in s])
1120
+ ims[i] = im if im.data.contiguous else np.ascontiguousarray(im) # update
1121
+ shape1 = [make_divisible(x, self.stride) for x in np.array(shape1).max(0)] # inf shape
1122
+ x = [letterbox(im, shape1, auto=False)[0] for im in ims] # pad
1123
+ x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2))) # stack and BHWC to BCHW
1124
+ x = torch.from_numpy(x).to(p.device).type_as(p) / 255 # uint8 to fp16/32
1125
+
1126
+ with amp.autocast(autocast):
1127
+ # Inference
1128
+ with dt[1]:
1129
+ y = self.model(x, augment=augment) # forward
1130
+
1131
+ # Post-process
1132
+ with dt[2]:
1133
+ y = non_max_suppression(y if self.dmb else y[0],
1134
+ self.conf,
1135
+ self.iou,
1136
+ self.classes,
1137
+ self.agnostic,
1138
+ self.multi_label,
1139
+ max_det=self.max_det) # NMS
1140
+ for i in range(n):
1141
+ scale_boxes(shape1, y[i][:, :4], shape0[i])
1142
+
1143
+ return Detections(ims, y, files, dt, self.names, x.shape)
1144
+
1145
+
1146
+ class Detections:
1147
+ # YOLO detections class for inference results
1148
+ def __init__(self, ims, pred, files, times=(0, 0, 0), names=None, shape=None):
1149
+ super().__init__()
1150
+ d = pred[0].device # device
1151
+ gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in ims] # normalizations
1152
+ self.ims = ims # list of images as numpy arrays
1153
+ self.pred = pred # list of tensors pred[0] = (xyxy, conf, cls)
1154
+ self.names = names # class names
1155
+ self.files = files # image filenames
1156
+ self.times = times # profiling times
1157
+ self.xyxy = pred # xyxy pixels
1158
+ self.xywh = [xyxy2xywh(x) for x in pred] # xywh pixels
1159
+ self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)] # xyxy normalized
1160
+ self.xywhn = [x / g for x, g in zip(self.xywh, gn)] # xywh normalized
1161
+ self.n = len(self.pred) # number of images (batch size)
1162
+ self.t = tuple(x.t / self.n * 1E3 for x in times) # timestamps (ms)
1163
+ self.s = tuple(shape) # inference BCHW shape
1164
+
1165
+ def _run(self, pprint=False, show=False, save=False, crop=False, render=False, labels=True, save_dir=Path('')):
1166
+ s, crops = '', []
1167
+ for i, (im, pred) in enumerate(zip(self.ims, self.pred)):
1168
+ s += f'\nimage {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} ' # string
1169
+ if pred.shape[0]:
1170
+ for c in pred[:, -1].unique():
1171
+ n = (pred[:, -1] == c).sum() # detections per class
1172
+ s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, " # add to string
1173
+ s = s.rstrip(', ')
1174
+ if show or save or render or crop:
1175
+ annotator = Annotator(im, example=str(self.names))
1176
+ for *box, conf, cls in reversed(pred): # xyxy, confidence, class
1177
+ label = f'{self.names[int(cls)]} {conf:.2f}'
1178
+ if crop:
1179
+ file = save_dir / 'crops' / self.names[int(cls)] / self.files[i] if save else None
1180
+ crops.append({
1181
+ 'box': box,
1182
+ 'conf': conf,
1183
+ 'cls': cls,
1184
+ 'label': label,
1185
+ 'im': save_one_box(box, im, file=file, save=save)})
1186
+ else: # all others
1187
+ annotator.box_label(box, label if labels else '', color=colors(cls))
1188
+ im = annotator.im
1189
+ else:
1190
+ s += '(no detections)'
1191
+
1192
+ im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im # from np
1193
+ if show:
1194
+ display(im) if is_notebook() else im.show(self.files[i])
1195
+ if save:
1196
+ f = self.files[i]
1197
+ im.save(save_dir / f) # save
1198
+ if i == self.n - 1:
1199
+ LOGGER.info(f"Saved {self.n} image{'s' * (self.n > 1)} to {colorstr('bold', save_dir)}")
1200
+ if render:
1201
+ self.ims[i] = np.asarray(im)
1202
+ if pprint:
1203
+ s = s.lstrip('\n')
1204
+ return f'{s}\nSpeed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {self.s}' % self.t
1205
+ if crop:
1206
+ if save:
1207
+ LOGGER.info(f'Saved results to {save_dir}\n')
1208
+ return crops
1209
+
1210
+ @TryExcept('Showing images is not supported in this environment')
1211
+ def show(self, labels=True):
1212
+ self._run(show=True, labels=labels) # show results
1213
+
1214
+ def save(self, labels=True, save_dir='runs/detect/exp', exist_ok=False):
1215
+ save_dir = increment_path(save_dir, exist_ok, mkdir=True) # increment save_dir
1216
+ self._run(save=True, labels=labels, save_dir=save_dir) # save results
1217
+
1218
+ def crop(self, save=True, save_dir='runs/detect/exp', exist_ok=False):
1219
+ save_dir = increment_path(save_dir, exist_ok, mkdir=True) if save else None
1220
+ return self._run(crop=True, save=save, save_dir=save_dir) # crop results
1221
+
1222
+ def render(self, labels=True):
1223
+ self._run(render=True, labels=labels) # render results
1224
+ return self.ims
1225
+
1226
+ def pandas(self):
1227
+ # return detections as pandas DataFrames, i.e. print(results.pandas().xyxy[0])
1228
+ new = copy(self) # return copy
1229
+ ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name' # xyxy columns
1230
+ cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name' # xywh columns
1231
+ for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
1232
+ a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)] # update
1233
+ setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
1234
+ return new
1235
+
1236
+ def tolist(self):
1237
+ # return a list of Detections objects, i.e. 'for result in results.tolist():'
1238
+ r = range(self.n) # iterable
1239
+ x = [Detections([self.ims[i]], [self.pred[i]], [self.files[i]], self.times, self.names, self.s) for i in r]
1240
+ # for d in x:
1241
+ # for k in ['ims', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
1242
+ # setattr(d, k, getattr(d, k)[0]) # pop out of list
1243
+ return x
1244
+
1245
+ def print(self):
1246
+ LOGGER.info(self.__str__())
1247
+
1248
+ def __len__(self): # override len(results)
1249
+ return self.n
1250
+
1251
+ def __str__(self): # override print(results)
1252
+ return self._run(pprint=True) # print results
1253
+
1254
+ def __repr__(self):
1255
+ return f'YOLO {self.__class__} instance\n' + self.__str__()
1256
+
1257
+
1258
+ class Proto(nn.Module):
1259
+ # YOLO mask Proto module for segmentation models
1260
+ def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks
1261
+ super().__init__()
1262
+ self.cv1 = Conv(c1, c_, k=3)
1263
+ self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
1264
+ self.cv2 = Conv(c_, c_, k=3)
1265
+ self.cv3 = Conv(c_, c2)
1266
+
1267
+ def forward(self, x):
1268
+ return self.cv3(self.cv2(self.upsample(self.cv1(x))))
1269
+
1270
+
1271
+ class UConv(nn.Module):
1272
+ def __init__(self, c1, c_=256, c2=256): # ch_in, hidden channels, ch_out
1273
+ super().__init__()
1274
+
1275
+ self.cv1 = Conv(c1, c_, k=3)
1276
+ self.cv2 = nn.Conv2d(c_, c2, 1, 1)
1277
+ self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
1278
+
1279
+ def forward(self, x):
1280
+ return self.up(self.cv2(self.cv1(x)))
1281
+
1282
+
1283
+ class Classify(nn.Module):
1284
+ # YOLO classification head, i.e. x(b,c1,20,20) to x(b,c2)
1285
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups
1286
+ super().__init__()
1287
+ c_ = 1280 # efficientnet_b0 size
1288
+ self.conv = Conv(c1, c_, k, s, autopad(k, p), g)
1289
+ self.pool = nn.AdaptiveAvgPool2d(1) # to x(b,c_,1,1)
1290
+ self.drop = nn.Dropout(p=0.0, inplace=True)
1291
+ self.linear = nn.Linear(c_, c2) # to x(b,c2)
1292
+
1293
+ def forward(self, x):
1294
+ if isinstance(x, list):
1295
+ x = torch.cat(x, 1)
1296
+ return self.linear(self.drop(self.pool(self.conv(x)).flatten(1)))
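models/common.py above defines the AutoShape pre/post-processing path (letterbox resize, AMP forward, NMS) and the Detections results container with its print/pandas/save helpers. A minimal usage sketch, assuming the AutoShape wrapper defined earlier in models/common.py and a placeholder weights file name:

    from models.common import AutoShape
    from models.experimental import attempt_load

    detector = AutoShape(attempt_load('pk-yolo.pt', device='cpu'))  # 'pk-yolo.pt' is a placeholder checkpoint name
    results = detector(['data/images/horses.jpg'], size=640)        # accepts paths, numpy arrays or PIL images
    results.print()                                                 # per-image detection summary
    df = results.pandas().xyxy[0]                                   # xmin, ymin, xmax, ymax, confidence, class, name
    results.save(save_dir='runs/detect/exp')                        # annotated copies of the inputs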
models/detect/pk-yolo.yaml ADDED
@@ -0,0 +1,126 @@
1
+ # YOLOv9
2
+
3
+ # parameters
4
+ nc: 2 # number of classes
5
+ depth_multiple: 1.0 # model depth multiple
6
+ width_multiple: 1.0 # layer channel multiple
7
+ # activation: nn.LeakyReLU(0.1)
8
+ # activation: nn.ReLU()
9
+
10
+ # anchors
11
+ anchors: 3
12
+
13
+ # YOLOv9 backbone
14
+ backbone:
15
+ [
16
+ [-1, 1, Silence, []],
17
+
18
+ [-1, 1, Backbone, []],
19
+ # conv down
20
+ [1, 1, Down0, [64]], #2 320 1
21
+ [1, 1, Down1, [128]], # 3 160 3
22
+ [1, 1, Down2, [256]],# 4 80 5
23
+ [1, 1, Down3, [512]], #5 40 7
24
+ [1, 1, Down4, [1024]], #6 20 9
25
+
26
+ # routing
27
+ [ 2, 1, CBLinear, [ [ 64 ] ] ], # 10
28
+ [ 3, 1, CBLinear, [ [ 64, 128 ] ] ], # 11
29
+ [ 4, 1, CBLinear, [ [ 64, 128, 256 ] ] ], # 12
30
+ [ 5, 1, CBLinear, [ [ 64, 128, 256, 512 ] ] ], # 13
31
+ [ 6, 1, CBLinear, [ [ 64, 128, 256, 512, 1024 ] ] ], # 14 -3
32
+
33
+ # conv down fuse
34
+ [ 0, 1, Conv, [ 64, 3, 2 ] ], # 15-P1/2
35
+ [ [ 7, 8, 9, 10, 11, -1 ], 1, CBFuse, [ [ 0, 0, 0, 0, 0 ] ] ], # 16
36
+
37
+ # conv down fuse
38
+ [ -1, 1, Conv, [ 128, 3, 2 ] ], # 17-P2/4
39
+ [ [ 8, 9, 10, 11, -1 ], 1, CBFuse, [ [ 1, 1, 1, 1 ] ] ], # 18
40
+
41
+ # elan-1 block
42
+ [ -1, 1, RepNCSPELAN4, [ 256, 128, 64, 2 ] ], # 19
43
+
44
+ # avg-conv down fuse
45
+ [ -1, 1, ADown, [ 256 ] ], # 20-P3/8
46
+ [ [ 9, 10, 11, -1 ], 1, CBFuse, [ [ 2, 2, 2 ] ] ], # 21
47
+
48
+ # elan-2 block
49
+ [ -1, 1, RepNCSPELAN4, [ 512, 256, 128, 2 ] ], # 22
50
+
51
+ # avg-conv down fuse
52
+ [ -1, 1, ADown, [ 512 ] ], # 23-P4/16
53
+ [ [ 10, 11, -1 ], 1, CBFuse, [ [ 3, 3 ] ] ], # 24
54
+
55
+ # elan-2 block
56
+ [ -1, 1, RepNCSPELAN4, [ 1024, 512, 256, 2 ] ], # 25
57
+
58
+ # avg-conv down fuse
59
+ [ -1, 1, ADown, [ 1024 ] ], # 26-P5/32
60
+ [ [ 11, -1 ], 1, CBFuse, [ [ 4 ] ] ], # 27
61
+
62
+ # elan-2 block
63
+ [ -1, 1, RepNCSPELAN4, [ 1024, 512, 256, 2 ] ], # 28 25
64
+
65
+ ]
66
+
67
+ # YOLOv9 head
68
+ head:
69
+ [
70
+ # multi-level auxiliary branch
71
+
72
+ # elan-spp block
73
+ [6, 1, SPPELAN, [512, 256]], # 29
74
+
75
+ # up-concat merge
76
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
77
+ [[-1, 5], 1, Concat, [1]], # cat backbone P4
78
+
79
+ # csp-elan block
80
+ [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 32
81
+
82
+ # up-concat merge
83
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
84
+ [[-1, 4], 1, Concat, [1]], # cat backbone P3
85
+
86
+ # csp-elan block
87
+ [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 35
88
+
89
+
90
+
91
+ # main branch
92
+
93
+ # elan-spp block
94
+ [25, 1, SPPELAN, [512, 256]], # 36
95
+
96
+ # up-concat merge
97
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
98
+ [[-1, 22], 1, Concat, [1]], # cat backbone P4
99
+
100
+ # csp-elan block
101
+ [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 39
102
+
103
+ # up-concat merge
104
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
105
+ [[-1, 19], 1, Concat, [1]], # cat backbone P3
106
+
107
+ # csp-elan block
108
+ [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 42 (P3/8-small)
109
+
110
+ # avg-conv-down merge
111
+ [-1, 1, ADown, [256]],
112
+ [[-1, 36], 1, Concat, [1]], # cat head P4
113
+
114
+ # csp-elan block
115
+ [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 45 (P4/16-medium)
116
+
117
+ # avg-conv-down merge
118
+ [-1, 1, ADown, [512]],
119
+ [[-1, 33], 1, Concat, [1]], # cat head P5
120
+
121
+ # csp-elan block
122
+ [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]], # 48 (P5/32-large)
123
+
124
+ # detect
125
+ [[32, 29, 26, 39, 42, 45], 1, DualDDetect, [nc]], # DualDDetect(A3, A4, A5, P3, P4, P5)
126
+ ]
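Each row in the backbone and head lists follows the [from, number, module, args] convention consumed by parse_model in models/yolo.py; a from value of -1 indexes the previous layer, and the final DualDDetect row collects three auxiliary and three main feature levels, as its comment notes. A short build-and-probe sketch, assuming the Model alias exposed by models/yolo.py and a 640x640 input as in the YOLOv9 code this repo follows (the custom Backbone/Down* modules referenced above must also be importable):

    import torch
    from models.yolo import Model

    model = Model('models/detect/pk-yolo.yaml', ch=3, nc=2)  # nc=2 matches the header above
    y = model(torch.zeros(1, 3, 640, 640))                   # DualDDetect yields auxiliary + main predictions
    print(len(model.model), model.yaml['nc'])                 # parsed layer count, number of classes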
models/detect/yolov9-e.yaml ADDED
@@ -0,0 +1,144 @@
1
+ # YOLOv9
2
+
3
+ # parameters
4
+ nc: 2 # number of classes
5
+ depth_multiple: 1.0 # model depth multiple
6
+ width_multiple: 1.0 # layer channel multiple
7
+ #activation: nn.LeakyReLU(0.1)
8
+ #activation: nn.ReLU()
9
+
10
+ # anchors
11
+ anchors: 3
12
+
13
+ # YOLOv9 backbone
14
+ backbone:
15
+ [
16
+ [-1, 1, Silence, []],
17
+
18
+ # conv down
19
+ [-1, 1, Conv, [64, 3, 2]], # 1-P1/2
20
+
21
+ # conv down
22
+ [-1, 1, Conv, [128, 3, 2]], # 2-P2/4
23
+
24
+ # csp-elan block
25
+ [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]], # 3
26
+
27
+ # avg-conv down
28
+ [-1, 1, ADown, [256]], # 4-P3/8
29
+
30
+ # csp-elan block
31
+ [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]], # 5
32
+
33
+ # avg-conv down
34
+ [-1, 1, ADown, [512]], # 6-P4/16
35
+
36
+ # csp-elan block
37
+ [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 7
38
+
39
+ # avg-conv down
40
+ [-1, 1, ADown, [1024]], # 8-P5/32
41
+
42
+ # csp-elan block
43
+ [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 9
44
+
45
+ # routing
46
+ [1, 1, CBLinear, [[64]]], # 10
47
+ [3, 1, CBLinear, [[64, 128]]], # 11
48
+ [5, 1, CBLinear, [[64, 128, 256]]], # 12
49
+ [7, 1, CBLinear, [[64, 128, 256, 512]]], # 13
50
+ [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]], # 14
51
+
52
+ # conv down
53
+ [0, 1, Conv, [64, 3, 2]], # 15-P1/2
54
+ [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]], # 16
55
+
56
+ # conv down
57
+ [-1, 1, Conv, [128, 3, 2]], # 17-P2/4
58
+ [[11, 12, 13, 14, -1], 1, CBFuse, [[1, 1, 1, 1]]], # 18
59
+
60
+ # csp-elan block
61
+ [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]], # 19
62
+
63
+ # avg-conv down fuse
64
+ [-1, 1, ADown, [256]], # 20-P3/8
65
+ [[12, 13, 14, -1], 1, CBFuse, [[2, 2, 2]]], # 21
66
+
67
+ # csp-elan block
68
+ [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]], # 22
69
+
70
+ # avg-conv down fuse
71
+ [-1, 1, ADown, [512]], # 23-P4/16
72
+ [[13, 14, -1], 1, CBFuse, [[3, 3]]], # 24
73
+
74
+ # csp-elan block
75
+ [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 25
76
+
77
+ # avg-conv down fuse
78
+ [-1, 1, ADown, [1024]], # 26-P5/32
79
+ [[14, -1], 1, CBFuse, [[4]]], # 27
80
+
81
+ # csp-elan block
82
+ [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 28
83
+ ]
84
+
85
+ # YOLOv9 head
86
+ head:
87
+ [
88
+ # multi-level auxiliary branch
89
+
90
+ # elan-spp block
91
+ [9, 1, SPPELAN, [512, 256]], # 29
92
+
93
+ # up-concat merge
94
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
95
+ [[-1, 7], 1, Concat, [1]], # cat backbone P4
96
+
97
+ # csp-elan block
98
+ [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 32
99
+
100
+ # up-concat merge
101
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
102
+ [[-1, 5], 1, Concat, [1]], # cat backbone P3
103
+
104
+ # csp-elan block
105
+ [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 35
106
+
107
+
108
+
109
+ # main branch
110
+
111
+ # elan-spp block
112
+ [28, 1, SPPELAN, [512, 256]], # 36
113
+
114
+ # up-concat merge
115
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
116
+ [[-1, 25], 1, Concat, [1]], # cat backbone P4
117
+
118
+ # csp-elan block
119
+ [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 39
120
+
121
+ # up-concat merge
122
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
123
+ [[-1, 22], 1, Concat, [1]], # cat backbone P3
124
+
125
+ # csp-elan block
126
+ [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 42 (P3/8-small)
127
+
128
+ # avg-conv-down merge
129
+ [-1, 1, ADown, [256]],
130
+ [[-1, 39], 1, Concat, [1]], # cat head P4
131
+
132
+ # csp-elan block
133
+ [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 45 (P4/16-medium)
134
+
135
+ # avg-conv-down merge
136
+ [-1, 1, ADown, [512]],
137
+ [[-1, 36], 1, Concat, [1]], # cat head P5
138
+
139
+ # csp-elan block
140
+ [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]], # 48 (P5/32-large)
141
+
142
+ # detect
143
+ [[35, 32, 29, 42, 45, 48], 1, DualDDetect, [nc]], # DualDDetect(A3, A4, A5, P3, P4, P5)
144
+ ]
models/experimental.py ADDED
@@ -0,0 +1,275 @@
1
+ import math
+ import random # used by ORT_NMS.forward below to build dummy indices during ONNX tracing
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from utils.downloads import attempt_download
8
+
9
+
10
+ class Sum(nn.Module):
11
+ # Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070
12
+ def __init__(self, n, weight=False): # n: number of inputs
13
+ super().__init__()
14
+ self.weight = weight # apply weights boolean
15
+ self.iter = range(n - 1) # iter object
16
+ if weight:
17
+ self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True) # layer weights
18
+
19
+ def forward(self, x):
20
+ y = x[0] # no weight
21
+ if self.weight:
22
+ w = torch.sigmoid(self.w) * 2
23
+ for i in self.iter:
24
+ y = y + x[i + 1] * w[i]
25
+ else:
26
+ for i in self.iter:
27
+ y = y + x[i + 1]
28
+ return y
29
+
30
+
31
+ class MixConv2d(nn.Module):
32
+ # Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595
33
+ def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): # ch_in, ch_out, kernel, stride, ch_strategy
34
+ super().__init__()
35
+ n = len(k) # number of convolutions
36
+ if equal_ch: # equal c_ per group
37
+ i = torch.linspace(0, n - 1E-6, c2).floor() # c2 indices
38
+ c_ = [(i == g).sum() for g in range(n)] # intermediate channels
39
+ else: # equal weight.numel() per group
40
+ b = [c2] + [0] * n
41
+ a = np.eye(n + 1, n, k=-1)
42
+ a -= np.roll(a, 1, axis=1)
43
+ a *= np.array(k) ** 2
44
+ a[0] = 1
45
+ c_ = np.linalg.lstsq(a, b, rcond=None)[0].round() # solve for equal weight indices, ax = b
46
+
47
+ self.m = nn.ModuleList([
48
+ nn.Conv2d(c1, int(c_), k, s, k // 2, groups=math.gcd(c1, int(c_)), bias=False) for k, c_ in zip(k, c_)])
49
+ self.bn = nn.BatchNorm2d(c2)
50
+ self.act = nn.SiLU()
51
+
52
+ def forward(self, x):
53
+ return self.act(self.bn(torch.cat([m(x) for m in self.m], 1)))
54
+
55
+
56
+ class Ensemble(nn.ModuleList):
57
+ # Ensemble of models
58
+ def __init__(self):
59
+ super().__init__()
60
+
61
+ def forward(self, x, augment=False, profile=False, visualize=False):
62
+ y = [module(x, augment, profile, visualize)[0] for module in self]
63
+ # y = torch.stack(y).max(0)[0] # max ensemble
64
+ # y = torch.stack(y).mean(0) # mean ensemble
65
+ y = torch.cat(y, 1) # nms ensemble
66
+ return y, None # inference, train output
67
+
68
+
69
+ class ORT_NMS(torch.autograd.Function):
70
+ '''ONNX-Runtime NMS operation'''
71
+ @staticmethod
72
+ def forward(ctx,
73
+ boxes,
74
+ scores,
75
+ max_output_boxes_per_class=torch.tensor([100]),
76
+ iou_threshold=torch.tensor([0.45]),
77
+ score_threshold=torch.tensor([0.25])):
78
+ device = boxes.device
79
+ batch = scores.shape[0]
80
+ num_det = random.randint(0, 100)
81
+ batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device)
82
+ idxs = torch.arange(100, 100 + num_det).to(device)
83
+ zeros = torch.zeros((num_det,), dtype=torch.int64).to(device)
84
+ selected_indices = torch.cat([batches[None], zeros[None], idxs[None]], 0).T.contiguous()
85
+ selected_indices = selected_indices.to(torch.int64)
86
+ return selected_indices
87
+
88
+ @staticmethod
89
+ def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold):
90
+ return g.op("NonMaxSuppression", boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold)
91
+
92
+
93
+ class TRT_NMS(torch.autograd.Function):
94
+ '''TensorRT NMS operation'''
95
+ @staticmethod
96
+ def forward(
97
+ ctx,
98
+ boxes,
99
+ scores,
100
+ background_class=-1,
101
+ box_coding=1,
102
+ iou_threshold=0.45,
103
+ max_output_boxes=100,
104
+ plugin_version="1",
105
+ score_activation=0,
106
+ score_threshold=0.25,
107
+ ):
108
+
109
+ batch_size, num_boxes, num_classes = scores.shape
110
+ num_det = torch.randint(0, max_output_boxes, (batch_size, 1), dtype=torch.int32)
111
+ det_boxes = torch.randn(batch_size, max_output_boxes, 4)
112
+ det_scores = torch.randn(batch_size, max_output_boxes)
113
+ det_classes = torch.randint(0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32)
114
+ return num_det, det_boxes, det_scores, det_classes
115
+
116
+ @staticmethod
117
+ def symbolic(g,
118
+ boxes,
119
+ scores,
120
+ background_class=-1,
121
+ box_coding=1,
122
+ iou_threshold=0.45,
123
+ max_output_boxes=100,
124
+ plugin_version="1",
125
+ score_activation=0,
126
+ score_threshold=0.25):
127
+ out = g.op("TRT::EfficientNMS_TRT",
128
+ boxes,
129
+ scores,
130
+ background_class_i=background_class,
131
+ box_coding_i=box_coding,
132
+ iou_threshold_f=iou_threshold,
133
+ max_output_boxes_i=max_output_boxes,
134
+ plugin_version_s=plugin_version,
135
+ score_activation_i=score_activation,
136
+ score_threshold_f=score_threshold,
137
+ outputs=4)
138
+ nums, boxes, scores, classes = out
139
+ return nums, boxes, scores, classes
140
+
141
+
142
+ class ONNX_ORT(nn.Module):
143
+ '''onnx module with ONNX-Runtime NMS operation.'''
144
+ def __init__(self, max_obj=100, iou_thres=0.45, score_thres=0.25, max_wh=640, device=None, n_classes=80):
145
+ super().__init__()
146
+ self.device = device if device else torch.device("cpu")
147
+ self.max_obj = torch.tensor([max_obj]).to(device)
148
+ self.iou_threshold = torch.tensor([iou_thres]).to(device)
149
+ self.score_threshold = torch.tensor([score_thres]).to(device)
150
+ self.max_wh = max_wh # if max_wh != 0 : non-agnostic else : agnostic
151
+ self.convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
152
+ dtype=torch.float32,
153
+ device=self.device)
154
+ self.n_classes=n_classes
155
+
156
+ def forward(self, x):
157
+ ## https://github.com/thaitc-hust/yolov9-tensorrt/blob/main/torch2onnx.py
158
+ ## thanks https://github.com/thaitc-hust
159
+ if isinstance(x, list): ## yolov9-c.pt and yolov9-e.pt return list
160
+ x = x[1]
161
+ x = x.permute(0, 2, 1)
162
+ bboxes_x = x[..., 0:1]
163
+ bboxes_y = x[..., 1:2]
164
+ bboxes_w = x[..., 2:3]
165
+ bboxes_h = x[..., 3:4]
166
+ bboxes = torch.cat([bboxes_x, bboxes_y, bboxes_w, bboxes_h], dim = -1)
167
+ bboxes = bboxes.unsqueeze(2) # [n_batch, n_bboxes, 4] -> [n_batch, n_bboxes, 1, 4]
168
+ obj_conf = x[..., 4:]
169
+ scores = obj_conf
170
+ bboxes @= self.convert_matrix
171
+ max_score, category_id = scores.max(2, keepdim=True)
172
+ dis = category_id.float() * self.max_wh
173
+ nmsbox = bboxes + dis
174
+ max_score_tp = max_score.transpose(1, 2).contiguous()
175
+ selected_indices = ORT_NMS.apply(nmsbox, max_score_tp, self.max_obj, self.iou_threshold, self.score_threshold)
176
+ X, Y = selected_indices[:, 0], selected_indices[:, 2]
177
+ selected_boxes = bboxes[X, Y, :]
178
+ selected_categories = category_id[X, Y, :].float()
179
+ selected_scores = max_score[X, Y, :]
180
+ X = X.unsqueeze(1).float()
181
+ return torch.cat([X, selected_boxes, selected_categories, selected_scores], 1)
182
+
183
+
184
+ class ONNX_TRT(nn.Module):
185
+ '''onnx module with TensorRT NMS operation.'''
186
+ def __init__(self, max_obj=100, iou_thres=0.45, score_thres=0.25, max_wh=None ,device=None, n_classes=80):
187
+ super().__init__()
188
+ assert max_wh is None
189
+ self.device = device if device else torch.device('cpu')
190
+ self.background_class = -1,
191
+ self.box_coding = 1,
192
+ self.iou_threshold = iou_thres
193
+ self.max_obj = max_obj
194
+ self.plugin_version = '1'
195
+ self.score_activation = 0
196
+ self.score_threshold = score_thres
197
+ self.n_classes=n_classes
198
+
199
+ def forward(self, x):
200
+ ## https://github.com/thaitc-hust/yolov9-tensorrt/blob/main/torch2onnx.py
201
+ ## thanks https://github.com/thaitc-hust
202
+ if isinstance(x, list): ## yolov9-c.pt and yolov9-e.pt return list
203
+ x = x[1]
204
+ x = x.permute(0, 2, 1)
205
+ bboxes_x = x[..., 0:1]
206
+ bboxes_y = x[..., 1:2]
207
+ bboxes_w = x[..., 2:3]
208
+ bboxes_h = x[..., 3:4]
209
+ bboxes = torch.cat([bboxes_x, bboxes_y, bboxes_w, bboxes_h], dim = -1)
210
+ bboxes = bboxes.unsqueeze(2) # [n_batch, n_bboxes, 4] -> [n_batch, n_bboxes, 1, 4]
211
+ obj_conf = x[..., 4:]
212
+ scores = obj_conf
213
+ num_det, det_boxes, det_scores, det_classes = TRT_NMS.apply(bboxes, scores, self.background_class, self.box_coding,
214
+ self.iou_threshold, self.max_obj,
215
+ self.plugin_version, self.score_activation,
216
+ self.score_threshold)
217
+ return num_det, det_boxes, det_scores, det_classes
218
+
219
+ class End2End(nn.Module):
220
+ '''export onnx or tensorrt model with NMS operation.'''
221
+ def __init__(self, model, max_obj=100, iou_thres=0.45, score_thres=0.25, max_wh=None, device=None, n_classes=80):
222
+ super().__init__()
223
+ device = device if device else torch.device('cpu')
224
+ assert isinstance(max_wh,(int)) or max_wh is None
225
+ self.model = model.to(device)
226
+ self.model.model[-1].end2end = True
227
+ self.patch_model = ONNX_TRT if max_wh is None else ONNX_ORT
228
+ self.end2end = self.patch_model(max_obj, iou_thres, score_thres, max_wh, device, n_classes)
229
+ self.end2end.eval()
230
+
231
+ def forward(self, x):
232
+ x = self.model(x)
233
+ x = self.end2end(x)
234
+ return x
235
+
236
+
237
+ def attempt_load(weights, device=None, inplace=True, fuse=True):
238
+ # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a
239
+ from models.yolo import Detect, Model
240
+
241
+ model = Ensemble()
242
+ for w in weights if isinstance(weights, list) else [weights]:
243
+ ckpt = torch.load(attempt_download(w), map_location='cpu') # load
244
+ ckpt = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model
245
+
246
+ # Model compatibility updates
247
+ if not hasattr(ckpt, 'stride'):
248
+ ckpt.stride = torch.tensor([32.])
249
+ if hasattr(ckpt, 'names') and isinstance(ckpt.names, (list, tuple)):
250
+ ckpt.names = dict(enumerate(ckpt.names)) # convert to dict
251
+
252
+ model.append(ckpt.fuse().eval() if fuse and hasattr(ckpt, 'fuse') else ckpt.eval()) # model in eval mode
253
+
254
+ # Module compatibility updates
255
+ for m in model.modules():
256
+ t = type(m)
257
+ if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Model):
258
+ m.inplace = inplace # torch 1.7.0 compatibility
259
+ # if t is Detect and not isinstance(m.anchor_grid, list):
260
+ # delattr(m, 'anchor_grid')
261
+ # setattr(m, 'anchor_grid', [torch.zeros(1)] * m.nl)
262
+ elif t is nn.Upsample and not hasattr(m, 'recompute_scale_factor'):
263
+ m.recompute_scale_factor = None # torch 1.11.0 compatibility
264
+
265
+ # Return model
266
+ if len(model) == 1:
267
+ return model[-1]
268
+
269
+ # Return detection ensemble
270
+ print(f'Ensemble created with {weights}\n')
271
+ for k in 'names', 'nc', 'yaml':
272
+ setattr(model, k, getattr(model[0], k))
273
+ model.stride = model[torch.argmax(torch.tensor([m.stride.max() for m in model])).int()].stride # max stride
274
+ assert all(model[0].nc == m.nc for m in model), f'Models have different class counts: {[m.nc for m in model]}'
275
+ return model
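attempt_load returns the bare model when given a single checkpoint and an Ensemble (per-model outputs concatenated before NMS) when given a list. A minimal sketch with placeholder checkpoint names:

    from models.experimental import attempt_load

    single = attempt_load('best.pt', device='cpu', fuse=True)      # fused, eval-mode model
    ensemble = attempt_load(['best.pt', 'last.pt'], device='cpu')   # Ensemble of two checkpoints
    print(single.stride, single.names)                              # attributes patched by the compatibility block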
models/repvit.py ADDED
@@ -0,0 +1,440 @@
1
+ import torch.nn as nn
2
+ import numpy as np
3
+ import itertools
4
+
5
+ def _make_divisible(v, divisor, min_value=None):
6
+ """
7
+ This function is taken from the original tf repo.
8
+ It ensures that all layers have a channel number that is divisible by 8
9
+ It can be seen here:
10
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
11
+ :param v:
12
+ :param divisor:
13
+ :param min_value:
14
+ :return:
15
+ """
16
+ if min_value is None:
17
+ min_value = divisor
18
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
19
+ # Make sure that round down does not go down by more than 10%.
20
+ if new_v < 0.9 * v:
21
+ new_v += divisor
22
+ return new_v
23
+
24
+ from timm.models.layers import SqueezeExcite
25
+
26
+ import torch
27
+
28
+ class Conv2d_BN(torch.nn.Sequential):
29
+ def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
30
+ groups=1, bn_weight_init=1, resolution=-10000):
31
+ super().__init__()
32
+ self.add_module('c', torch.nn.Conv2d(
33
+ a, b, ks, stride, pad, dilation, groups, bias=False))
34
+ self.add_module('bn', torch.nn.BatchNorm2d(b))
35
+ torch.nn.init.constant_(self.bn.weight, bn_weight_init)
36
+ torch.nn.init.constant_(self.bn.bias, 0)
37
+
38
+ @torch.no_grad()
39
+ def fuse(self):
40
+ c, bn = self._modules.values()
41
+ w = bn.weight / (bn.running_var + bn.eps)**0.5
42
+ w = c.weight * w[:, None, None, None]
43
+ b = bn.bias - bn.running_mean * bn.weight / \
44
+ (bn.running_var + bn.eps)**0.5
45
+ m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
46
+ 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups,
47
+ device=c.weight.device)
48
+ m.weight.data.copy_(w)
49
+ m.bias.data.copy_(b)
50
+ return m
51
+
52
+ class Residual(torch.nn.Module):
53
+ def __init__(self, m, drop=0.):
54
+ super().__init__()
55
+ self.m = m
56
+ self.drop = drop
57
+
58
+ def forward(self, x):
59
+ if self.training and self.drop > 0:
60
+ return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
61
+ device=x.device).ge_(self.drop).div(1 - self.drop).detach()
62
+ else:
63
+ return x + self.m(x)
64
+
65
+ @torch.no_grad()
66
+ def fuse(self):
67
+ if isinstance(self.m, Conv2d_BN):
68
+ m = self.m.fuse()
69
+ assert(m.groups == m.in_channels)
70
+ identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
71
+ identity = torch.nn.functional.pad(identity, [1,1,1,1])
72
+ m.weight += identity.to(m.weight.device)
73
+ return m
74
+ elif isinstance(self.m, torch.nn.Conv2d):
75
+ m = self.m
76
+ assert(m.groups != m.in_channels)
77
+ identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
78
+ identity = torch.nn.functional.pad(identity, [1,1,1,1])
79
+ m.weight += identity.to(m.weight.device)
80
+ return m
81
+ else:
82
+ return self
83
+
84
+
85
+ class RepVGGDW(torch.nn.Module):
86
+ def __init__(self, ed) -> None:
87
+ super().__init__()
88
+ self.conv = Conv2d_BN(ed, ed, 3, 1, 1, groups=ed)
89
+ self.conv1 = torch.nn.Conv2d(ed, ed, 1, 1, 0, groups=ed)
90
+ self.dim = ed
91
+ self.bn = torch.nn.BatchNorm2d(ed)
92
+
93
+ def forward(self, x):
94
+ return self.bn((self.conv(x) + self.conv1(x)) + x)
95
+
96
+ @torch.no_grad()
97
+ def fuse(self):
98
+ conv = self.conv.fuse()
99
+ conv1 = self.conv1
100
+
101
+ conv_w = conv.weight
102
+ conv_b = conv.bias
103
+ conv1_w = conv1.weight
104
+ conv1_b = conv1.bias
105
+
106
+ conv1_w = torch.nn.functional.pad(conv1_w, [1,1,1,1])
107
+
108
+ identity = torch.nn.functional.pad(torch.ones(conv1_w.shape[0], conv1_w.shape[1], 1, 1, device=conv1_w.device), [1,1,1,1])
109
+
110
+ final_conv_w = conv_w + conv1_w + identity
111
+ final_conv_b = conv_b + conv1_b
112
+
113
+ conv.weight.data.copy_(final_conv_w)
114
+ conv.bias.data.copy_(final_conv_b)
115
+
116
+ bn = self.bn
117
+ w = bn.weight / (bn.running_var + bn.eps)**0.5
118
+ w = conv.weight * w[:, None, None, None]
119
+ b = bn.bias + (conv.bias - bn.running_mean) * bn.weight / \
120
+ (bn.running_var + bn.eps)**0.5
121
+ conv.weight.data.copy_(w)
122
+ conv.bias.data.copy_(b)
123
+ return conv
124
+
125
+
126
+ class RepViTBlock(nn.Module):
127
+ def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
128
+ super(RepViTBlock, self).__init__()
129
+ assert stride in [1, 2]
130
+
131
+ self.identity = stride == 1 and inp == oup
132
+ assert(hidden_dim == 2 * inp)
133
+
134
+ if stride == 2:
135
+ self.token_mixer = nn.Sequential(
136
+ Conv2d_BN(inp, inp, kernel_size, stride, (kernel_size - 1) // 2, groups=inp),
137
+ SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
138
+ Conv2d_BN(inp, oup, ks=1, stride=1, pad=0)
139
+ )
140
+ self.channel_mixer = Residual(nn.Sequential(
141
+ # pw
142
+ Conv2d_BN(oup, 2 * oup, 1, 1, 0),
143
+ nn.GELU() if use_hs else nn.GELU(),
144
+ # pw-linear
145
+ Conv2d_BN(2 * oup, oup, 1, 1, 0, bn_weight_init=0),
146
+ ))
147
+ else:
148
+ assert(self.identity)
149
+ self.token_mixer = nn.Sequential(
150
+ RepVGGDW(inp),
151
+ SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
152
+ )
153
+ self.channel_mixer = Residual(nn.Sequential(
154
+ # pw
155
+ Conv2d_BN(inp, hidden_dim, 1, 1, 0),
156
+ nn.GELU() if use_hs else nn.GELU(),
157
+ # pw-linear
158
+ Conv2d_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
159
+ ))
160
+
161
+ def forward(self, x):
162
+ return self.channel_mixer(self.token_mixer(x))
163
+
164
+ from timm.models.vision_transformer import trunc_normal_
165
+ class BN_Linear(torch.nn.Sequential):
166
+ def __init__(self, a, b, bias=True, std=0.02):
167
+ super().__init__()
168
+ self.add_module('bn', torch.nn.BatchNorm1d(a))
169
+ self.add_module('l', torch.nn.Linear(a, b, bias=bias))
170
+ trunc_normal_(self.l.weight, std=std)
171
+ if bias:
172
+ torch.nn.init.constant_(self.l.bias, 0)
173
+
174
+ @torch.no_grad()
175
+ def fuse(self):
176
+ bn, l = self._modules.values()
177
+ w = bn.weight / (bn.running_var + bn.eps)**0.5
178
+ b = bn.bias - self.bn.running_mean * \
179
+ self.bn.weight / (bn.running_var + bn.eps)**0.5
180
+ w = l.weight * w[None, :]
181
+ if l.bias is None:
182
+ b = b @ self.l.weight.T
183
+ else:
184
+ b = (l.weight @ b[:, None]).view(-1) + self.l.bias
185
+ m = torch.nn.Linear(w.size(1), w.size(0), device=l.weight.device)
186
+ m.weight.data.copy_(w)
187
+ m.bias.data.copy_(b)
188
+ return m
189
+
190
+ class RepViT(nn.Module):
191
+ def __init__(self, cfgs, distillation=False, pretrained=None, init_cfg=None, out_indices=[]):
192
+ super(RepViT, self).__init__()
193
+ # setting of inverted residual blocks
194
+ self.cfgs = cfgs
195
+
196
+ # building first layer
197
+ input_channel = self.cfgs[0][2]
198
+ patch_embed = torch.nn.Sequential(Conv2d_BN(3, input_channel // 2, 3, 2, 1), torch.nn.GELU() )
199
+ layers = [patch_embed]
200
+ patch_embed2 = torch.nn.Sequential(Conv2d_BN(input_channel // 2, input_channel, 3, 2, 1), torch.nn.GELU())
201
+ layers.append(patch_embed2)
202
+
203
+ # building inverted residual blocks
204
+ block = RepViTBlock
205
+ for k, t, c, use_se, use_hs, s in self.cfgs:
206
+ output_channel = _make_divisible(c, 8)
207
+ exp_size = _make_divisible(input_channel * t, 8)
208
+ layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
209
+ input_channel = output_channel
210
+ self.features = nn.ModuleList(layers)
211
+ #
212
+ # self.init_cfg = init_cfg
213
+ # assert(self.init_cfg is not None)
214
+ self.out_indices = out_indices
215
+
216
+ #self = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self)
217
+ self.train()
218
+ self.out_indices=[0,5,11, 37, 42]
219
+ # 320 160 80 40 20
220
+ def train(self, mode=True):
221
+ """Convert the model into training mode while keep layers freezed."""
222
+ super(RepViT, self).train(mode)
223
+
224
+ def forward(self, x):
225
+ outs = []
226
+ for i, f in enumerate(self.features):
227
+ x = f(x)
228
+ #print(x.shape)
229
+ if i in self.out_indices:
230
+ outs.append(x)
231
+ #print(x.shape)
232
+ # assert(len(outs) == 4)
233
+ return outs
234
+
235
+ from timm.models import register_model
236
+ def repvit_m1_1(pretrained=False, num_classes = 1000, distillation=False, init_cfg=None, out_indices=[], **kwargs):
237
+ """
238
+ Constructs a RepViT-M1.1 model
239
+ """
240
+ cfgs = [
241
+ # k, t, c, SE, HS, s
242
+ [3, 2, 64, 1, 0, 1],
243
+ [3, 2, 64, 0, 0, 1],
244
+ [3, 2, 64, 0, 0, 1],
245
+ [3, 2, 128, 0, 0, 2],
246
+ [3, 2, 128, 1, 0, 1],
247
+ [3, 2, 128, 0, 0, 1],
248
+ [3, 2, 128, 0, 0, 1],
249
+ [3, 2, 256, 0, 1, 2],
250
+ [3, 2, 256, 1, 1, 1],
251
+ [3, 2, 256, 0, 1, 1],
252
+ [3, 2, 256, 1, 1, 1],
253
+ [3, 2, 256, 0, 1, 1],
254
+ [3, 2, 256, 1, 1, 1],
255
+ [3, 2, 256, 0, 1, 1],
256
+ [3, 2, 256, 1, 1, 1],
257
+ [3, 2, 256, 0, 1, 1],
258
+ [3, 2, 256, 1, 1, 1],
259
+ [3, 2, 256, 0, 1, 1],
260
+ [3, 2, 256, 1, 1, 1],
261
+ [3, 2, 256, 0, 1, 1],
262
+ [3, 2, 256, 0, 1, 1],
263
+ [3, 2, 512, 0, 1, 2],
264
+ [3, 2, 512, 1, 1, 1],
265
+ [3, 2, 512, 0, 1, 1]
266
+ ]
267
+ return RepViT(cfgs, init_cfg=init_cfg, pretrained=pretrained, distillation=distillation, out_indices=out_indices)
268
+
269
+ def repvit_m1_5(pretrained=False, num_classes = 1000, distillation=False, init_cfg=None, out_indices=[], **kwargs):
270
+ """
271
+ Constructs a RepViT-M1.5 model
272
+ """
273
+ cfgs = [
274
+ # k, t, c, SE, HS, s
275
+ [3, 2, 64, 1, 0, 1],
276
+ [3, 2, 64, 0, 0, 1],
277
+ [3, 2, 64, 1, 0, 1],
278
+ [3, 2, 64, 0, 0, 1],
279
+ [3, 2, 64, 0, 0, 1],
280
+ [3, 2, 128, 0, 0, 2],
281
+ [3, 2, 128, 1, 0, 1],
282
+ [3, 2, 128, 0, 0, 1],
283
+ [3, 2, 128, 1, 0, 1],
284
+ [3, 2, 128, 0, 0, 1],
285
+ [3, 2, 128, 0, 0, 1],
286
+ [3, 2, 256, 0, 1, 2],
287
+ [3, 2, 256, 1, 1, 1],
288
+ [3, 2, 256, 0, 1, 1],
289
+ [3, 2, 256, 1, 1, 1],
290
+ [3, 2, 256, 0, 1, 1],
291
+ [3, 2, 256, 1, 1, 1],
292
+ [3, 2, 256, 0, 1, 1],
293
+ [3, 2, 256, 1, 1, 1],
294
+ [3, 2, 256, 0, 1, 1],
295
+ [3, 2, 256, 1, 1, 1],
296
+ [3, 2, 256, 0, 1, 1],
297
+ [3, 2, 256, 1, 1, 1],
298
+ [3, 2, 256, 0, 1, 1],
299
+ [3, 2, 256, 1, 1, 1],
300
+ [3, 2, 256, 0, 1, 1],
301
+ [3, 2, 256, 1, 1, 1],
302
+ [3, 2, 256, 0, 1, 1],
303
+ [3, 2, 256, 1, 1, 1],
304
+ [3, 2, 256, 0, 1, 1],
305
+ [3, 2, 256, 1, 1, 1],
306
+ [3, 2, 256, 0, 1, 1],
307
+ [3, 2, 256, 1, 1, 1],
308
+ [3, 2, 256, 0, 1, 1],
309
+ [3, 2, 256, 1, 1, 1],
310
+ [3, 2, 256, 0, 1, 1],
311
+ [3, 2, 256, 0, 1, 1],
312
+ [3, 2, 512, 0, 1, 2],
313
+ [3, 2, 512, 1, 1, 1],
314
+ [3, 2, 512, 0, 1, 1],
315
+ [3, 2, 512, 1, 1, 1],
316
+ [3, 2, 512, 0, 1, 1]
317
+ ]
318
+ return RepViT(cfgs, init_cfg=init_cfg, pretrained=pretrained, distillation=distillation, out_indices=out_indices)
319
+
320
+
321
+ def repvit_m2_3(pretrained=False, num_classes = 1000, distillation=False, init_cfg=None, out_indices=[], **kwargs):
322
+ """
323
+ Constructs a RepViT-M2.3 model
324
+ """
325
+ cfgs = [
326
+ # k, t, c, SE, HS, s
327
+ [3, 2, 80, 1, 0, 1],
328
+ [3, 2, 80, 0, 0, 1],
329
+ [3, 2, 80, 1, 0, 1],
330
+ [3, 2, 80, 0, 0, 1],
331
+ [3, 2, 80, 1, 0, 1],
332
+ [3, 2, 80, 0, 0, 1],
333
+ [3, 2, 80, 0, 0, 1],
334
+ [3, 2, 160, 0, 0, 2],
335
+ [3, 2, 160, 1, 0, 1],
336
+ [3, 2, 160, 0, 0, 1],
337
+ [3, 2, 160, 1, 0, 1],
338
+ [3, 2, 160, 0, 0, 1],
339
+ [3, 2, 160, 1, 0, 1],
340
+ [3, 2, 160, 0, 0, 1],
341
+ [3, 2, 160, 0, 0, 1],
342
+ [3, 2, 320, 0, 1, 2],
343
+ [3, 2, 320, 1, 1, 1],
344
+ [3, 2, 320, 0, 1, 1],
345
+ [3, 2, 320, 1, 1, 1],
346
+ [3, 2, 320, 0, 1, 1],
347
+ [3, 2, 320, 1, 1, 1],
348
+ [3, 2, 320, 0, 1, 1],
349
+ [3, 2, 320, 1, 1, 1],
350
+ [3, 2, 320, 0, 1, 1],
351
+ [3, 2, 320, 1, 1, 1],
352
+ [3, 2, 320, 0, 1, 1],
353
+ [3, 2, 320, 1, 1, 1],
354
+ [3, 2, 320, 0, 1, 1],
355
+ [3, 2, 320, 1, 1, 1],
356
+ [3, 2, 320, 0, 1, 1],
357
+ [3, 2, 320, 1, 1, 1],
358
+ [3, 2, 320, 0, 1, 1],
359
+ [3, 2, 320, 1, 1, 1],
360
+ [3, 2, 320, 0, 1, 1],
361
+ [3, 2, 320, 1, 1, 1],
362
+ [3, 2, 320, 0, 1, 1],
363
+ [3, 2, 320, 1, 1, 1],
364
+ [3, 2, 320, 0, 1, 1],
365
+ [3, 2, 320, 1, 1, 1],
366
+ [3, 2, 320, 0, 1, 1],
367
+ [3, 2, 320, 1, 1, 1],
368
+ [3, 2, 320, 0, 1, 1],
369
+ [3, 2, 320, 1, 1, 1],
370
+ [3, 2, 320, 0, 1, 1],
371
+ [3, 2, 320, 1, 1, 1],
372
+ [3, 2, 320, 0, 1, 1],
373
+ [3, 2, 320, 1, 1, 1],
374
+ [3, 2, 320, 0, 1, 1],
375
+ [3, 2, 320, 1, 1, 1],
376
+ [3, 2, 320, 0, 1, 1],
377
+ # [3, 2, 320, 1, 1, 1],
378
+ # [3, 2, 320, 0, 1, 1],
379
+ [3, 2, 320, 0, 1, 1],
380
+ [3, 2, 640, 0, 1, 2],
381
+ [3, 2, 640, 1, 1, 1],
382
+ [3, 2, 640, 0, 1, 1],
383
+ # [3, 2, 640, 1, 1, 1],
384
+ # [3, 2, 640, 0, 1, 1]
385
+ ]
386
+ return RepViT(cfgs, init_cfg=init_cfg, pretrained=pretrained, distillation=distillation, out_indices=out_indices)
387
+
388
+
389
+
390
+
391
+ cfgs = [
392
+ # k, t, c, SE, HS, s
393
+ [3, 2, 64*2, 1, 0, 1],
394
+ [3, 2, 64*2, 0, 0, 1],
395
+ [3, 2, 64*2, 1, 0, 1],
396
+ [3, 2, 64*2, 0, 0, 1],
397
+ [3, 2, 64*2, 0, 0, 1],
398
+ [3, 2, 128*2, 0, 0, 2],
399
+ [3, 2, 128*2, 1, 0, 1],
400
+ [3, 2, 128*2, 0, 0, 1],
401
+ [3, 2, 128*2, 1, 0, 1],
402
+ [3, 2, 128*2, 0, 0, 1],
403
+ [3, 2, 128*2, 0, 0, 1],
404
+ [3, 2, 256*2, 0, 1, 2],
405
+ [3, 2, 256*2, 1, 1, 1],
406
+ [3, 2, 256*2, 0, 1, 1],
407
+ [3, 2, 256*2, 1, 1, 1],
408
+ [3, 2, 256*2, 0, 1, 1],
409
+ [3, 2, 256*2, 1, 1, 1],
410
+ [3, 2, 256*2, 0, 1, 1],
411
+ [3, 2, 256*2, 1, 1, 1],
412
+ [3, 2, 256*2, 0, 1, 1],
413
+ [3, 2, 256*2, 1, 1, 1],
414
+ [3, 2, 256*2, 0, 1, 1],
415
+ [3, 2, 256*2, 1, 1, 1],
416
+ [3, 2, 256*2, 0, 1, 1],
417
+ [3, 2, 256*2, 1, 1, 1],
418
+ [3, 2, 256*2, 0, 1, 1],
419
+ [3, 2, 256*2, 1, 1, 1],
420
+ [3, 2, 256*2, 0, 1, 1],
421
+ [3, 2, 256*2, 1, 1, 1],
422
+ [3, 2, 256*2, 0, 1, 1],
423
+ [3, 2, 256*2, 1, 1, 1],
424
+ [3, 2, 256*2, 0, 1, 1],
425
+ [3, 2, 256*2, 1, 1, 1],
426
+ [3, 2, 256*2, 0, 1, 1],
427
+ [3, 2, 256*2, 1, 1, 1],
428
+ [3, 2, 256*2, 0, 1, 1],
429
+ [3, 2, 256*2, 0, 1, 1],
430
+ [3, 2, 512*2, 0, 1, 2],
431
+ [3, 2, 512*2, 1, 1, 1],
432
+ [3, 2, 512*2, 0, 1, 1],
433
+ [3, 2, 512*2, 1, 1, 1],
434
+ [3, 2, 512*2, 0, 1, 1]
435
+ ]
436
+
437
+ if __name__ == "__main__":
438
+ model = RepViT(cfgs)
439
+ t1 = torch.rand(1,3,640,640)
440
+ x = model(t1)
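With the hard-coded out_indices = [0, 5, 11, 37, 42], the backbone returns five feature maps whose spatial sizes follow the "# 320 160 80 40 20" comment for a 640x640 input (strides 2 through 32). A small sketch extending the __main__ check above:

    import torch
    from models.repvit import RepViT, cfgs   # module-level names defined in this file

    backbone = RepViT(cfgs)
    feats = backbone(torch.rand(1, 3, 640, 640))
    for f in feats:
        print(tuple(f.shape))                # expected spatial sizes: 320, 160, 80, 40, 20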
models/tf.py ADDED
@@ -0,0 +1,596 @@
1
+ import argparse
2
+ import sys
3
+ from copy import deepcopy
4
+ from pathlib import Path
5
+
6
+ FILE = Path(__file__).resolve()
7
+ ROOT = FILE.parents[1] # YOLO root directory
8
+ if str(ROOT) not in sys.path:
9
+ sys.path.append(str(ROOT)) # add ROOT to PATH
10
+ # ROOT = ROOT.relative_to(Path.cwd()) # relative
11
+
12
+ import numpy as np
13
+ import tensorflow as tf
14
+ import torch
15
+ import torch.nn as nn
16
+ from tensorflow import keras
17
+
18
+ from models.common import (C3, SPP, SPPF, Bottleneck, BottleneckCSP, C3x, Concat, Conv, CrossConv, DWConv,
19
+ DWConvTranspose2d, Focus, autopad)
20
+ from models.experimental import MixConv2d, attempt_load
21
+ from models.yolo import Detect, Segment
22
+ from utils.activations import SiLU
23
+ from utils.general import LOGGER, make_divisible, print_args
24
+
25
+
26
+ class TFBN(keras.layers.Layer):
27
+ # TensorFlow BatchNormalization wrapper
28
+ def __init__(self, w=None):
29
+ super().__init__()
30
+ self.bn = keras.layers.BatchNormalization(
31
+ beta_initializer=keras.initializers.Constant(w.bias.numpy()),
32
+ gamma_initializer=keras.initializers.Constant(w.weight.numpy()),
33
+ moving_mean_initializer=keras.initializers.Constant(w.running_mean.numpy()),
34
+ moving_variance_initializer=keras.initializers.Constant(w.running_var.numpy()),
35
+ epsilon=w.eps)
36
+
37
+ def call(self, inputs):
38
+ return self.bn(inputs)
39
+
40
+
41
+ class TFPad(keras.layers.Layer):
42
+ # Pad inputs in spatial dimensions 1 and 2
43
+ def __init__(self, pad):
44
+ super().__init__()
45
+ if isinstance(pad, int):
46
+ self.pad = tf.constant([[0, 0], [pad, pad], [pad, pad], [0, 0]])
47
+ else: # tuple/list
48
+ self.pad = tf.constant([[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]])
49
+
50
+ def call(self, inputs):
51
+ return tf.pad(inputs, self.pad, mode='constant', constant_values=0)
52
+
53
+
54
+ class TFConv(keras.layers.Layer):
55
+ # Standard convolution
56
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None):
57
+ # ch_in, ch_out, weights, kernel, stride, padding, groups
58
+ super().__init__()
59
+ assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
60
+ # TensorFlow convolution padding is inconsistent with PyTorch (e.g. k=3 s=2 'SAME' padding)
61
+ # see https://stackoverflow.com/questions/52975843/comparing-conv2d-with-padding-between-tensorflow-and-pytorch
62
+ conv = keras.layers.Conv2D(
63
+ filters=c2,
64
+ kernel_size=k,
65
+ strides=s,
66
+ padding='SAME' if s == 1 else 'VALID',
67
+ use_bias=not hasattr(w, 'bn'),
68
+ kernel_initializer=keras.initializers.Constant(w.conv.weight.permute(2, 3, 1, 0).numpy()),
69
+ bias_initializer='zeros' if hasattr(w, 'bn') else keras.initializers.Constant(w.conv.bias.numpy()))
70
+ self.conv = conv if s == 1 else keras.Sequential([TFPad(autopad(k, p)), conv])
71
+ self.bn = TFBN(w.bn) if hasattr(w, 'bn') else tf.identity
72
+ self.act = activations(w.act) if act else tf.identity
73
+
74
+ def call(self, inputs):
75
+ return self.act(self.bn(self.conv(inputs)))
76
+
77
+
78
+ class TFDWConv(keras.layers.Layer):
79
+ # Depthwise convolution
80
+ def __init__(self, c1, c2, k=1, s=1, p=None, act=True, w=None):
81
+ # ch_in, ch_out, weights, kernel, stride, padding, groups
82
+ super().__init__()
83
+ assert c2 % c1 == 0, f'TFDWConv() output={c2} must be a multiple of input={c1} channels'
84
+ conv = keras.layers.DepthwiseConv2D(
85
+ kernel_size=k,
86
+ depth_multiplier=c2 // c1,
87
+ strides=s,
88
+ padding='SAME' if s == 1 else 'VALID',
89
+ use_bias=not hasattr(w, 'bn'),
90
+ depthwise_initializer=keras.initializers.Constant(w.conv.weight.permute(2, 3, 1, 0).numpy()),
91
+ bias_initializer='zeros' if hasattr(w, 'bn') else keras.initializers.Constant(w.conv.bias.numpy()))
92
+ self.conv = conv if s == 1 else keras.Sequential([TFPad(autopad(k, p)), conv])
93
+ self.bn = TFBN(w.bn) if hasattr(w, 'bn') else tf.identity
94
+ self.act = activations(w.act) if act else tf.identity
95
+
96
+ def call(self, inputs):
97
+ return self.act(self.bn(self.conv(inputs)))
98
+
99
+
100
+ class TFDWConvTranspose2d(keras.layers.Layer):
101
+ # Depthwise ConvTranspose2d
102
+ def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0, w=None):
103
+ # ch_in, ch_out, weights, kernel, stride, padding, groups
104
+ super().__init__()
105
+ assert c1 == c2, f'TFDWConv() output={c2} must be equal to input={c1} channels'
106
+ assert k == 4 and p1 == 1, 'TFDWConv() only valid for k=4 and p1=1'
107
+ weight, bias = w.weight.permute(2, 3, 1, 0).numpy(), w.bias.numpy()
108
+ self.c1 = c1
109
+ self.conv = [
110
+ keras.layers.Conv2DTranspose(filters=1,
111
+ kernel_size=k,
112
+ strides=s,
113
+ padding='VALID',
114
+ output_padding=p2,
115
+ use_bias=True,
116
+ kernel_initializer=keras.initializers.Constant(weight[..., i:i + 1]),
117
+ bias_initializer=keras.initializers.Constant(bias[i])) for i in range(c1)]
118
+
119
+ def call(self, inputs):
120
+ return tf.concat([m(x) for m, x in zip(self.conv, tf.split(inputs, self.c1, 3))], 3)[:, 1:-1, 1:-1]
121
+
122
+
123
+ class TFFocus(keras.layers.Layer):
124
+ # Focus wh information into c-space
125
+ def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None):
126
+ # ch_in, ch_out, kernel, stride, padding, groups
127
+ super().__init__()
128
+ self.conv = TFConv(c1 * 4, c2, k, s, p, g, act, w.conv)
129
+
130
+ def call(self, inputs): # x(b,w,h,c) -> y(b,w/2,h/2,4c)
131
+ # inputs = inputs / 255 # normalize 0-255 to 0-1
132
+ inputs = [inputs[:, ::2, ::2, :], inputs[:, 1::2, ::2, :], inputs[:, ::2, 1::2, :], inputs[:, 1::2, 1::2, :]]
133
+ return self.conv(tf.concat(inputs, 3))
134
+
135
+
136
+ class TFBottleneck(keras.layers.Layer):
137
+ # Standard bottleneck
138
+ def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, w=None): # ch_in, ch_out, shortcut, groups, expansion
139
+ super().__init__()
140
+ c_ = int(c2 * e) # hidden channels
141
+ self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
142
+ self.cv2 = TFConv(c_, c2, 3, 1, g=g, w=w.cv2)
143
+ self.add = shortcut and c1 == c2
144
+
145
+ def call(self, inputs):
146
+ return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs))
147
+
148
+
149
+ class TFCrossConv(keras.layers.Layer):
150
+ # Cross Convolution
151
+ def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False, w=None):
152
+ super().__init__()
153
+ c_ = int(c2 * e) # hidden channels
154
+ self.cv1 = TFConv(c1, c_, (1, k), (1, s), w=w.cv1)
155
+ self.cv2 = TFConv(c_, c2, (k, 1), (s, 1), g=g, w=w.cv2)
156
+ self.add = shortcut and c1 == c2
157
+
158
+ def call(self, inputs):
159
+ return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs))
160
+
161
+
162
+ class TFConv2d(keras.layers.Layer):
163
+ # Substitution for PyTorch nn.Conv2D
164
+ def __init__(self, c1, c2, k, s=1, g=1, bias=True, w=None):
165
+ super().__init__()
166
+ assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
167
+ self.conv = keras.layers.Conv2D(filters=c2,
168
+ kernel_size=k,
169
+ strides=s,
170
+ padding='VALID',
171
+ use_bias=bias,
172
+ kernel_initializer=keras.initializers.Constant(
173
+ w.weight.permute(2, 3, 1, 0).numpy()),
174
+ bias_initializer=keras.initializers.Constant(w.bias.numpy()) if bias else None)
175
+
176
+ def call(self, inputs):
177
+ return self.conv(inputs)
178
+
179
+
180
+ class TFBottleneckCSP(keras.layers.Layer):
181
+ # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
182
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
183
+ # ch_in, ch_out, number, shortcut, groups, expansion
184
+ super().__init__()
185
+ c_ = int(c2 * e) # hidden channels
186
+ self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
187
+ self.cv2 = TFConv2d(c1, c_, 1, 1, bias=False, w=w.cv2)
188
+ self.cv3 = TFConv2d(c_, c_, 1, 1, bias=False, w=w.cv3)
189
+ self.cv4 = TFConv(2 * c_, c2, 1, 1, w=w.cv4)
190
+ self.bn = TFBN(w.bn)
191
+ self.act = lambda x: keras.activations.swish(x)
192
+ self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)])
193
+
194
+ def call(self, inputs):
195
+ y1 = self.cv3(self.m(self.cv1(inputs)))
196
+ y2 = self.cv2(inputs)
197
+ return self.cv4(self.act(self.bn(tf.concat((y1, y2), axis=3))))
198
+
199
+
200
+ class TFC3(keras.layers.Layer):
201
+ # CSP Bottleneck with 3 convolutions
202
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
203
+ # ch_in, ch_out, number, shortcut, groups, expansion
204
+ super().__init__()
205
+ c_ = int(c2 * e) # hidden channels
206
+ self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
207
+ self.cv2 = TFConv(c1, c_, 1, 1, w=w.cv2)
208
+ self.cv3 = TFConv(2 * c_, c2, 1, 1, w=w.cv3)
209
+ self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)])
210
+
211
+ def call(self, inputs):
212
+ return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3))
213
+
214
+
215
+ class TFC3x(keras.layers.Layer):
216
+ # 3 module with cross-convolutions
217
+ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
218
+ # ch_in, ch_out, number, shortcut, groups, expansion
219
+ super().__init__()
220
+ c_ = int(c2 * e) # hidden channels
221
+ self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
222
+ self.cv2 = TFConv(c1, c_, 1, 1, w=w.cv2)
223
+ self.cv3 = TFConv(2 * c_, c2, 1, 1, w=w.cv3)
224
+ self.m = keras.Sequential([
225
+ TFCrossConv(c_, c_, k=3, s=1, g=g, e=1.0, shortcut=shortcut, w=w.m[j]) for j in range(n)])
226
+
227
+ def call(self, inputs):
228
+ return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3))
229
+
230
+
231
+ class TFSPP(keras.layers.Layer):
232
+ # Spatial pyramid pooling layer used in YOLOv3-SPP
233
+ def __init__(self, c1, c2, k=(5, 9, 13), w=None):
234
+ super().__init__()
235
+ c_ = c1 // 2 # hidden channels
236
+ self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
237
+ self.cv2 = TFConv(c_ * (len(k) + 1), c2, 1, 1, w=w.cv2)
238
+ self.m = [keras.layers.MaxPool2D(pool_size=x, strides=1, padding='SAME') for x in k]
239
+
240
+ def call(self, inputs):
241
+ x = self.cv1(inputs)
242
+ return self.cv2(tf.concat([x] + [m(x) for m in self.m], 3))
243
+
244
+
245
+ class TFSPPF(keras.layers.Layer):
246
+ # Spatial pyramid pooling-Fast layer
247
+ def __init__(self, c1, c2, k=5, w=None):
248
+ super().__init__()
249
+ c_ = c1 // 2 # hidden channels
250
+ self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
251
+ self.cv2 = TFConv(c_ * 4, c2, 1, 1, w=w.cv2)
252
+ self.m = keras.layers.MaxPool2D(pool_size=k, strides=1, padding='SAME')
253
+
254
+ def call(self, inputs):
255
+ x = self.cv1(inputs)
256
+ y1 = self.m(x)
257
+ y2 = self.m(y1)
258
+ return self.cv2(tf.concat([x, y1, y2, self.m(y2)], 3))
259
+
260
+
261
+ class TFDetect(keras.layers.Layer):
262
+ # TF YOLO Detect layer
263
+ def __init__(self, nc=80, anchors=(), ch=(), imgsz=(640, 640), w=None): # detection layer
264
+ super().__init__()
265
+ self.stride = tf.convert_to_tensor(w.stride.numpy(), dtype=tf.float32)
266
+ self.nc = nc # number of classes
267
+ self.no = nc + 5 # number of outputs per anchor
268
+ self.nl = len(anchors) # number of detection layers
269
+ self.na = len(anchors[0]) // 2 # number of anchors
270
+ self.grid = [tf.zeros(1)] * self.nl # init grid
271
+ self.anchors = tf.convert_to_tensor(w.anchors.numpy(), dtype=tf.float32)
272
+ self.anchor_grid = tf.reshape(self.anchors * tf.reshape(self.stride, [self.nl, 1, 1]), [self.nl, 1, -1, 1, 2])
273
+ self.m = [TFConv2d(x, self.no * self.na, 1, w=w.m[i]) for i, x in enumerate(ch)]
274
+ self.training = False # set to False after building model
275
+ self.imgsz = imgsz
276
+ for i in range(self.nl):
277
+ ny, nx = self.imgsz[0] // self.stride[i], self.imgsz[1] // self.stride[i]
278
+ self.grid[i] = self._make_grid(nx, ny)
279
+
280
+ def call(self, inputs):
281
+ z = [] # inference output
282
+ x = []
283
+ for i in range(self.nl):
284
+ x.append(self.m[i](inputs[i]))
285
+ # x(bs,20,20,255) to x(bs,3,20,20,85)
286
+ ny, nx = self.imgsz[0] // self.stride[i], self.imgsz[1] // self.stride[i]
287
+ x[i] = tf.reshape(x[i], [-1, ny * nx, self.na, self.no])
288
+
289
+ if not self.training: # inference
290
+ y = x[i]
291
+ grid = tf.transpose(self.grid[i], [0, 2, 1, 3]) - 0.5
292
+ anchor_grid = tf.transpose(self.anchor_grid[i], [0, 2, 1, 3]) * 4
293
+ xy = (tf.sigmoid(y[..., 0:2]) * 2 + grid) * self.stride[i] # xy
294
+ wh = tf.sigmoid(y[..., 2:4]) ** 2 * anchor_grid
295
+ # Normalize xywh to 0-1 to reduce calibration error
296
+ xy /= tf.constant([[self.imgsz[1], self.imgsz[0]]], dtype=tf.float32)
297
+ wh /= tf.constant([[self.imgsz[1], self.imgsz[0]]], dtype=tf.float32)
298
+ y = tf.concat([xy, wh, tf.sigmoid(y[..., 4:5 + self.nc]), y[..., 5 + self.nc:]], -1)
299
+ z.append(tf.reshape(y, [-1, self.na * ny * nx, self.no]))
300
+
301
+ return tf.transpose(x, [0, 2, 1, 3]) if self.training else (tf.concat(z, 1),)
302
+
303
+ @staticmethod
304
+ def _make_grid(nx=20, ny=20):
305
+ # yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
306
+ # return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
307
+ xv, yv = tf.meshgrid(tf.range(nx), tf.range(ny))
308
+ return tf.cast(tf.reshape(tf.stack([xv, yv], 2), [1, 1, ny * nx, 2]), dtype=tf.float32)
309
+
310
+
311
+ class TFSegment(TFDetect):
312
+ # YOLO Segment head for segmentation models
313
+ def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), imgsz=(640, 640), w=None):
314
+ super().__init__(nc, anchors, ch, imgsz, w)
315
+ self.nm = nm # number of masks
316
+ self.npr = npr # number of protos
317
+ self.no = 5 + nc + self.nm # number of outputs per anchor
318
+ self.m = [TFConv2d(x, self.no * self.na, 1, w=w.m[i]) for i, x in enumerate(ch)] # output conv
319
+ self.proto = TFProto(ch[0], self.npr, self.nm, w=w.proto) # protos
320
+ self.detect = TFDetect.call
321
+
322
+ def call(self, x):
323
+ p = self.proto(x[0])
324
+ # p = TFUpsample(None, scale_factor=4, mode='nearest')(self.proto(x[0])) # (optional) full-size protos
325
+ p = tf.transpose(p, [0, 3, 1, 2]) # from shape(1,160,160,32) to shape(1,32,160,160)
326
+ x = self.detect(self, x)
327
+ return (x, p) if self.training else (x[0], p)
328
+
329
+
330
+ class TFProto(keras.layers.Layer):
331
+
332
+ def __init__(self, c1, c_=256, c2=32, w=None):
333
+ super().__init__()
334
+ self.cv1 = TFConv(c1, c_, k=3, w=w.cv1)
335
+ self.upsample = TFUpsample(None, scale_factor=2, mode='nearest')
336
+ self.cv2 = TFConv(c_, c_, k=3, w=w.cv2)
337
+ self.cv3 = TFConv(c_, c2, w=w.cv3)
338
+
339
+ def call(self, inputs):
340
+ return self.cv3(self.cv2(self.upsample(self.cv1(inputs))))
341
+
342
+
343
+ class TFUpsample(keras.layers.Layer):
344
+ # TF version of torch.nn.Upsample()
345
+ def __init__(self, size, scale_factor, mode, w=None): # warning: all arguments needed including 'w'
346
+ super().__init__()
347
+ assert scale_factor % 2 == 0, "scale_factor must be multiple of 2"
348
+ self.upsample = lambda x: tf.image.resize(x, (x.shape[1] * scale_factor, x.shape[2] * scale_factor), mode)
349
+ # self.upsample = keras.layers.UpSampling2D(size=scale_factor, interpolation=mode)
350
+ # with default arguments: align_corners=False, half_pixel_centers=False
351
+ # self.upsample = lambda x: tf.raw_ops.ResizeNearestNeighbor(images=x,
352
+ # size=(x.shape[1] * 2, x.shape[2] * 2))
353
+
354
+ def call(self, inputs):
355
+ return self.upsample(inputs)
356
+
357
+
358
+ class TFConcat(keras.layers.Layer):
359
+ # TF version of torch.concat()
360
+ def __init__(self, dimension=1, w=None):
361
+ super().__init__()
362
+ assert dimension == 1, "convert only NCHW to NHWC concat"
363
+ self.d = 3
364
+
365
+ def call(self, inputs):
366
+ return tf.concat(inputs, self.d)
367
+
368
+
369
+ def parse_model(d, ch, model, imgsz): # model_dict, input_channels(3)
370
+ LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
371
+ anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
372
+ na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors
373
+ no = na * (nc + 5) # number of outputs = anchors * (classes + 5)
374
+
375
+ layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
376
+ for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
377
+ m_str = m
378
+ m = eval(m) if isinstance(m, str) else m # eval strings
379
+ for j, a in enumerate(args):
380
+ try:
381
+ args[j] = eval(a) if isinstance(a, str) else a # eval strings
382
+ except NameError:
383
+ pass
384
+
385
+ n = max(round(n * gd), 1) if n > 1 else n # depth gain
386
+ if m in [
387
+ nn.Conv2d, Conv, DWConv, DWConvTranspose2d, Bottleneck, SPP, SPPF, MixConv2d, Focus, CrossConv,
388
+ BottleneckCSP, C3, C3x]:
389
+ c1, c2 = ch[f], args[0]
390
+ c2 = make_divisible(c2 * gw, 8) if c2 != no else c2
391
+
392
+ args = [c1, c2, *args[1:]]
393
+ if m in [BottleneckCSP, C3, C3x]:
394
+ args.insert(2, n)
395
+ n = 1
396
+ elif m is nn.BatchNorm2d:
397
+ args = [ch[f]]
398
+ elif m is Concat:
399
+ c2 = sum(ch[-1 if x == -1 else x + 1] for x in f)
400
+ elif m in [Detect, Segment]:
401
+ args.append([ch[x + 1] for x in f])
402
+ if isinstance(args[1], int): # number of anchors
403
+ args[1] = [list(range(args[1] * 2))] * len(f)
404
+ if m is Segment:
405
+ args[3] = make_divisible(args[3] * gw, 8)
406
+ args.append(imgsz)
407
+ else:
408
+ c2 = ch[f]
409
+
410
+ tf_m = eval('TF' + m_str.replace('nn.', ''))
411
+ m_ = keras.Sequential([tf_m(*args, w=model.model[i][j]) for j in range(n)]) if n > 1 \
412
+ else tf_m(*args, w=model.model[i]) # module
413
+
414
+ torch_m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module
415
+ t = str(m)[8:-2].replace('__main__.', '') # module type
416
+ np = sum(x.numel() for x in torch_m_.parameters()) # number params
417
+ m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params
418
+ LOGGER.info(f'{i:>3}{str(f):>18}{str(n):>3}{np:>10} {t:<40}{str(args):<30}') # print
419
+ save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
420
+ layers.append(m_)
421
+ ch.append(c2)
422
+ return keras.Sequential(layers), sorted(save)
423
+
424
+
425
+ class TFModel:
426
+ # TF YOLO model
427
+ def __init__(self, cfg='yolo.yaml', ch=3, nc=None, model=None, imgsz=(640, 640)): # model, channels, classes
428
+ super().__init__()
429
+ if isinstance(cfg, dict):
430
+ self.yaml = cfg # model dict
431
+ else: # is *.yaml
432
+ import yaml # for torch hub
433
+ self.yaml_file = Path(cfg).name
434
+ with open(cfg) as f:
435
+ self.yaml = yaml.load(f, Loader=yaml.FullLoader) # model dict
436
+
437
+ # Define model
438
+ if nc and nc != self.yaml['nc']:
439
+ LOGGER.info(f"Overriding {cfg} nc={self.yaml['nc']} with nc={nc}")
440
+ self.yaml['nc'] = nc # override yaml value
441
+ self.model, self.savelist = parse_model(deepcopy(self.yaml), ch=[ch], model=model, imgsz=imgsz)
442
+
443
+ def predict(self,
444
+ inputs,
445
+ tf_nms=False,
446
+ agnostic_nms=False,
447
+ topk_per_class=100,
448
+ topk_all=100,
449
+ iou_thres=0.45,
450
+ conf_thres=0.25):
451
+ y = [] # outputs
452
+ x = inputs
453
+ for m in self.model.layers:
454
+ if m.f != -1: # if not from previous layer
455
+ x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
456
+
457
+ x = m(x) # run
458
+ y.append(x if m.i in self.savelist else None) # save output
459
+
460
+ # Add TensorFlow NMS
461
+ if tf_nms:
462
+ boxes = self._xywh2xyxy(x[0][..., :4])
463
+ probs = x[0][:, :, 4:5]
464
+ classes = x[0][:, :, 5:]
465
+ scores = probs * classes
466
+ if agnostic_nms:
467
+ nms = AgnosticNMS()((boxes, classes, scores), topk_all, iou_thres, conf_thres)
468
+ else:
469
+ boxes = tf.expand_dims(boxes, 2)
470
+ nms = tf.image.combined_non_max_suppression(boxes,
471
+ scores,
472
+ topk_per_class,
473
+ topk_all,
474
+ iou_thres,
475
+ conf_thres,
476
+ clip_boxes=False)
477
+ return (nms,)
478
+ return x # output [1,6300,85] = [xywh, conf, class0, class1, ...]
479
+ # x = x[0] # [x(1,6300,85), ...] to x(6300,85)
480
+ # xywh = x[..., :4] # x(6300,4) boxes
481
+ # conf = x[..., 4:5] # x(6300,1) confidences
482
+ # cls = tf.reshape(tf.cast(tf.argmax(x[..., 5:], axis=1), tf.float32), (-1, 1)) # x(6300,1) classes
483
+ # return tf.concat([conf, cls, xywh], 1)
484
+
485
+ @staticmethod
486
+ def _xywh2xyxy(xywh):
487
+ # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
488
+ x, y, w, h = tf.split(xywh, num_or_size_splits=4, axis=-1)
489
+ return tf.concat([x - w / 2, y - h / 2, x + w / 2, y + h / 2], axis=-1)
490
+
491
+
492
+ class AgnosticNMS(keras.layers.Layer):
493
+ # TF Agnostic NMS
494
+ def call(self, input, topk_all, iou_thres, conf_thres):
495
+ # wrap map_fn to avoid TypeSpec related error https://stackoverflow.com/a/65809989/3036450
496
+ return tf.map_fn(lambda x: self._nms(x, topk_all, iou_thres, conf_thres),
497
+ input,
498
+ fn_output_signature=(tf.float32, tf.float32, tf.float32, tf.int32),
499
+ name='agnostic_nms')
500
+
501
+ @staticmethod
502
+ def _nms(x, topk_all=100, iou_thres=0.45, conf_thres=0.25): # agnostic NMS
503
+ boxes, classes, scores = x
504
+ class_inds = tf.cast(tf.argmax(classes, axis=-1), tf.float32)
505
+ scores_inp = tf.reduce_max(scores, -1)
506
+ selected_inds = tf.image.non_max_suppression(boxes,
507
+ scores_inp,
508
+ max_output_size=topk_all,
509
+ iou_threshold=iou_thres,
510
+ score_threshold=conf_thres)
511
+ selected_boxes = tf.gather(boxes, selected_inds)
512
+ padded_boxes = tf.pad(selected_boxes,
513
+ paddings=[[0, topk_all - tf.shape(selected_boxes)[0]], [0, 0]],
514
+ mode="CONSTANT",
515
+ constant_values=0.0)
516
+ selected_scores = tf.gather(scores_inp, selected_inds)
517
+ padded_scores = tf.pad(selected_scores,
518
+ paddings=[[0, topk_all - tf.shape(selected_boxes)[0]]],
519
+ mode="CONSTANT",
520
+ constant_values=-1.0)
521
+ selected_classes = tf.gather(class_inds, selected_inds)
522
+ padded_classes = tf.pad(selected_classes,
523
+ paddings=[[0, topk_all - tf.shape(selected_boxes)[0]]],
524
+ mode="CONSTANT",
525
+ constant_values=-1.0)
526
+ valid_detections = tf.shape(selected_inds)[0]
527
+ return padded_boxes, padded_scores, padded_classes, valid_detections
528
+
529
+
530
+ def activations(act=nn.SiLU):
531
+ # Returns TF activation from input PyTorch activation
532
+ if isinstance(act, nn.LeakyReLU):
533
+ return lambda x: keras.activations.relu(x, alpha=0.1)
534
+ elif isinstance(act, nn.Hardswish):
535
+ return lambda x: x * tf.nn.relu6(x + 3) * 0.166666667
536
+ elif isinstance(act, (nn.SiLU, SiLU)):
537
+ return lambda x: keras.activations.swish(x)
538
+ else:
539
+ raise Exception(f'no matching TensorFlow activation found for PyTorch activation {act}')
540
+
541
+
542
+ def representative_dataset_gen(dataset, ncalib=100):
543
+ # Representative dataset generator for use with converter.representative_dataset, returns a generator of np arrays
544
+ for n, (path, img, im0s, vid_cap, string) in enumerate(dataset):
545
+ im = np.transpose(img, [1, 2, 0])
546
+ im = np.expand_dims(im, axis=0).astype(np.float32)
547
+ im /= 255
548
+ yield [im]
549
+ if n >= ncalib:
550
+ break
551
+
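+ # Usage sketch (illustrative, not part of this file), assuming `dataset` yields CHW uint8 images as above:
+ #   converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+ #   converter.representative_dataset = lambda: representative_dataset_gen(dataset, ncalib=100)
+ #   converter.optimizations = [tf.lite.Optimize.DEFAULT]
+ #   tflite_model = converter.convert()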
552
+
553
+ def run(
554
+ weights=ROOT / 'yolo.pt', # weights path
555
+ imgsz=(640, 640), # inference size h,w
556
+ batch_size=1, # batch size
557
+ dynamic=False, # dynamic batch size
558
+ ):
559
+ # PyTorch model
560
+ im = torch.zeros((batch_size, 3, *imgsz)) # BCHW image
561
+ model = attempt_load(weights, device=torch.device('cpu'), inplace=True, fuse=False)
562
+ _ = model(im) # inference
563
+ model.info()
564
+
565
+ # TensorFlow model
566
+ im = tf.zeros((batch_size, *imgsz, 3)) # BHWC image
567
+ tf_model = TFModel(cfg=model.yaml, model=model, nc=model.nc, imgsz=imgsz)
568
+ _ = tf_model.predict(im) # inference
569
+
570
+ # Keras model
571
+ im = keras.Input(shape=(*imgsz, 3), batch_size=None if dynamic else batch_size)
572
+ keras_model = keras.Model(inputs=im, outputs=tf_model.predict(im))
573
+ keras_model.summary()
574
+
575
+ LOGGER.info('PyTorch, TensorFlow and Keras models successfully verified.\nUse export.py for TF model export.')
576
+
577
+
578
+ def parse_opt():
579
+ parser = argparse.ArgumentParser()
580
+ parser.add_argument('--weights', type=str, default=ROOT / 'yolo.pt', help='weights path')
581
+ parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
582
+ parser.add_argument('--batch-size', type=int, default=1, help='batch size')
583
+ parser.add_argument('--dynamic', action='store_true', help='dynamic batch size')
584
+ opt = parser.parse_args()
585
+ opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1 # expand
586
+ print_args(vars(opt))
587
+ return opt
588
+
589
+
590
+ def main(opt):
591
+ run(**vars(opt))
592
+
593
+
594
+ if __name__ == "__main__":
595
+ opt = parse_opt()
596
+ main(opt)
models/yolo.py ADDED
@@ -0,0 +1,771 @@
1
+ import argparse
2
+ import os
3
+ import platform
4
+ import sys
5
+ from copy import deepcopy
6
+ from pathlib import Path
7
+
8
+ FILE = Path(__file__).resolve()
9
+ ROOT = FILE.parents[1] # YOLO root directory
10
+ if str(ROOT) not in sys.path:
11
+ sys.path.append(str(ROOT)) # add ROOT to PATH
12
+ if platform.system() != 'Windows':
13
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
14
+
15
+ from models.common import *
16
+ from models.experimental import *
17
+ from utils.general import LOGGER, check_version, check_yaml, make_divisible, print_args
18
+ from utils.plots import feature_visualization
19
+ from utils.torch_utils import (fuse_conv_and_bn, initialize_weights, model_info, profile, scale_img, select_device,
20
+ time_sync)
21
+ from utils.tal.anchor_generator import make_anchors, dist2bbox
22
+
23
+ try:
24
+ import thop # for FLOPs computation
25
+ except ImportError:
26
+ thop = None
27
+
28
+
29
+ class Detect(nn.Module):
30
+ # YOLO Detect head for detection models
31
+ dynamic = False # force grid reconstruction
32
+ export = False # export mode
33
+ shape = None
34
+ anchors = torch.empty(0) # init
35
+ strides = torch.empty(0) # init
36
+
37
+ def __init__(self, nc=80, ch=(), inplace=True): # detection layer
38
+ super().__init__()
39
+ self.nc = nc # number of classes
40
+ self.nl = len(ch) # number of detection layers
41
+ self.reg_max = 16
42
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
43
+ self.inplace = inplace # use inplace ops (e.g. slice assignment)
44
+ self.stride = torch.zeros(self.nl) # strides computed during build
45
+
46
+ c2, c3 = max((ch[0] // 4, self.reg_max * 4, 16)), max((ch[0], min((self.nc * 2, 128)))) # channels
47
+ self.cv2 = nn.ModuleList(
48
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
49
+ self.cv3 = nn.ModuleList(
50
+ nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
51
+ self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
52
+
53
+ def forward(self, x):
54
+ shape = x[0].shape # BCHW
55
+ for i in range(self.nl):
56
+ x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
57
+ if self.training:
58
+ return x
59
+ elif self.dynamic or self.shape != shape:
60
+ self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
61
+ self.shape = shape
62
+
63
+ box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.nc), 1)
64
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
65
+ y = torch.cat((dbox, cls.sigmoid()), 1)
66
+ return y if self.export else (y, x)
67
+
68
+ def bias_init(self):
69
+ # Initialize Detect() biases, WARNING: requires stride availability
70
+ m = self # self.model[-1] # Detect() module
71
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
72
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
73
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
74
+ a[-1].bias.data[:] = 1.0 # box
75
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
76
+
77
+
78
+ class DDetect(nn.Module):
79
+ # YOLO Detect head for detection models
80
+ dynamic = False # force grid reconstruction
81
+ export = False # export mode
82
+ shape = None
83
+ anchors = torch.empty(0) # init
84
+ strides = torch.empty(0) # init
85
+
86
+ def __init__(self, nc=80, ch=(), inplace=True): # detection layer
87
+ super().__init__()
88
+ self.nc = nc # number of classes
89
+ self.nl = len(ch) # number of detection layers
90
+ self.reg_max = 16
91
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
92
+ self.inplace = inplace # use inplace ops (e.g. slice assignment)
93
+ self.stride = torch.zeros(self.nl) # strides computed during build
94
+
95
+ c2, c3 = make_divisible(max((ch[0] // 4, self.reg_max * 4, 16)), 4), max((ch[0], min((self.nc * 2, 128)))) # channels
96
+ self.cv2 = nn.ModuleList(
97
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4)) for x in ch)
98
+ self.cv3 = nn.ModuleList(
99
+ nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
100
+ self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
101
+
102
+ def forward(self, x):
103
+ shape = x[0].shape # BCHW
104
+ for i in range(self.nl):
105
+ x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
106
+ if self.training:
107
+ return x
108
+ elif self.dynamic or self.shape != shape:
109
+ self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
110
+ self.shape = shape
111
+
112
+ box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.nc), 1)
113
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
114
+ y = torch.cat((dbox, cls.sigmoid()), 1)
115
+ return y if self.export else (y, x)
116
+
117
+ def bias_init(self):
118
+ # Initialize Detect() biases, WARNING: requires stride availability
119
+ m = self # self.model[-1] # Detect() module
120
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
121
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
122
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
123
+ a[-1].bias.data[:] = 1.0 # box
124
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
125
+
126
+
127
+ class DualDetect(nn.Module):
128
+ # YOLO Detect head for detection models
129
+ dynamic = False # force grid reconstruction
130
+ export = False # export mode
131
+ shape = None
132
+ anchors = torch.empty(0) # init
133
+ strides = torch.empty(0) # init
134
+
135
+ def __init__(self, nc=80, ch=(), inplace=True): # detection layer
136
+ super().__init__()
137
+ self.nc = nc # number of classes
138
+ self.nl = len(ch) // 2 # number of detection layers
139
+ self.reg_max = 16
140
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
141
+ self.inplace = inplace # use inplace ops (e.g. slice assignment)
142
+ self.stride = torch.zeros(self.nl) # strides computed during build
143
+
144
+ c2, c3 = max((ch[0] // 4, self.reg_max * 4, 16)), max((ch[0], min((self.nc * 2, 128)))) # channels
145
+ c4, c5 = max((ch[self.nl] // 4, self.reg_max * 4, 16)), max((ch[self.nl], min((self.nc * 2, 128)))) # channels
146
+ self.cv2 = nn.ModuleList(
147
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch[:self.nl])
148
+ self.cv3 = nn.ModuleList(
149
+ nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch[:self.nl])
150
+ self.cv4 = nn.ModuleList(
151
+ nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, 4 * self.reg_max, 1)) for x in ch[self.nl:])
152
+ self.cv5 = nn.ModuleList(
153
+ nn.Sequential(Conv(x, c5, 3), Conv(c5, c5, 3), nn.Conv2d(c5, self.nc, 1)) for x in ch[self.nl:])
154
+ self.dfl = DFL(self.reg_max)
155
+ self.dfl2 = DFL(self.reg_max)
156
+
157
+ def forward(self, x):
158
+ shape = x[0].shape # BCHW
159
+ d1 = []
160
+ d2 = []
161
+ for i in range(self.nl):
162
+ d1.append(torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1))
163
+ d2.append(torch.cat((self.cv4[i](x[self.nl+i]), self.cv5[i](x[self.nl+i])), 1))
164
+ if self.training:
165
+ return [d1, d2]
166
+ elif self.dynamic or self.shape != shape:
167
+ self.anchors, self.strides = (d1.transpose(0, 1) for d1 in make_anchors(d1, self.stride, 0.5))
168
+ self.shape = shape
169
+
170
+ box, cls = torch.cat([di.view(shape[0], self.no, -1) for di in d1], 2).split((self.reg_max * 4, self.nc), 1)
171
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
172
+ box2, cls2 = torch.cat([di.view(shape[0], self.no, -1) for di in d2], 2).split((self.reg_max * 4, self.nc), 1)
173
+ dbox2 = dist2bbox(self.dfl2(box2), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
174
+ y = [torch.cat((dbox, cls.sigmoid()), 1), torch.cat((dbox2, cls2.sigmoid()), 1)]
175
+ return y if self.export else (y, [d1, d2])
176
+
177
+ def bias_init(self):
178
+ # Initialize Detect() biases, WARNING: requires stride availability
179
+ m = self # self.model[-1] # Detect() module
180
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
181
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
182
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
183
+ a[-1].bias.data[:] = 1.0 # box
184
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
185
+ for a, b, s in zip(m.cv4, m.cv5, m.stride): # from
186
+ a[-1].bias.data[:] = 1.0 # box
187
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
188
+
189
+
190
+ class DualDDetect(nn.Module):
191
+ # YOLO Detect head for detection models
192
+ dynamic = False # force grid reconstruction
193
+ export = False # export mode
194
+ shape = None
195
+ anchors = torch.empty(0) # init
196
+ strides = torch.empty(0) # init
197
+
198
+ def __init__(self, nc=80, ch=(), inplace=True): # detection layer
199
+ super().__init__()
200
+ self.nc = nc # number of classes
201
+ self.nl = len(ch) // 2 # number of detection layers
202
+ self.reg_max = 16
203
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
204
+ self.inplace = inplace # use inplace ops (e.g. slice assignment)
205
+ self.stride = torch.zeros(self.nl) # strides computed during build
206
+
207
+ c2, c3 = make_divisible(max((ch[0] // 4, self.reg_max * 4, 16)), 4), max((ch[0], min((self.nc * 2, 128)))) # channels
208
+ c4, c5 = make_divisible(max((ch[self.nl] // 4, self.reg_max * 4, 16)), 4), max((ch[self.nl], min((self.nc * 2, 128)))) # channels
209
+ self.cv2 = nn.ModuleList(
210
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4)) for x in ch[:self.nl])
211
+ self.cv3 = nn.ModuleList(
212
+ nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch[:self.nl])
213
+ self.cv4 = nn.ModuleList(
214
+ nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3, g=4), nn.Conv2d(c4, 4 * self.reg_max, 1, groups=4)) for x in ch[self.nl:])
215
+ self.cv5 = nn.ModuleList(
216
+ nn.Sequential(Conv(x, c5, 3), Conv(c5, c5, 3), nn.Conv2d(c5, self.nc, 1)) for x in ch[self.nl:])
217
+ self.dfl = DFL(self.reg_max)
218
+ self.dfl2 = DFL(self.reg_max)
219
+
220
+ def forward(self, x):
221
+ shape = x[0].shape # BCHW
222
+ d1 = []
223
+ d2 = []
224
+ for i in range(self.nl):
225
+ d1.append(torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1))
226
+ d2.append(torch.cat((self.cv4[i](x[self.nl+i]), self.cv5[i](x[self.nl+i])), 1))
227
+ if self.training:
228
+ return [d1, d2]
229
+ elif self.dynamic or self.shape != shape:
230
+ self.anchors, self.strides = (d1.transpose(0, 1) for d1 in make_anchors(d1, self.stride, 0.5))
231
+ self.shape = shape
232
+
233
+ box, cls = torch.cat([di.view(shape[0], self.no, -1) for di in d1], 2).split((self.reg_max * 4, self.nc), 1)
234
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
235
+ box2, cls2 = torch.cat([di.view(shape[0], self.no, -1) for di in d2], 2).split((self.reg_max * 4, self.nc), 1)
236
+ dbox2 = dist2bbox(self.dfl2(box2), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
237
+ y = [torch.cat((dbox, cls.sigmoid()), 1), torch.cat((dbox2, cls2.sigmoid()), 1)]
238
+ return y if self.export else (y, [d1, d2])
239
+ #y = torch.cat((dbox2, cls2.sigmoid()), 1)
240
+ #return y if self.export else (y, d2)
241
+ #y1 = torch.cat((dbox, cls.sigmoid()), 1)
242
+ #y2 = torch.cat((dbox2, cls2.sigmoid()), 1)
243
+ #return [y1, y2] if self.export else [(y1, d1), (y2, d2)]
244
+ #return [y1, y2] if self.export else [(y1, y2), (d1, d2)]
245
+
246
+ def bias_init(self):
247
+ # Initialize Detect() biases, WARNING: requires stride availability
248
+ m = self # self.model[-1] # Detect() module
249
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
250
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
251
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
252
+ a[-1].bias.data[:] = 1.0 # box
253
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
254
+ for a, b, s in zip(m.cv4, m.cv5, m.stride): # from
255
+ a[-1].bias.data[:] = 1.0 # box
256
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
257
+
258
+
259
+ class TripleDetect(nn.Module):
260
+ # YOLO Detect head for detection models
261
+ dynamic = False # force grid reconstruction
262
+ export = False # export mode
263
+ shape = None
264
+ anchors = torch.empty(0) # init
265
+ strides = torch.empty(0) # init
266
+
267
+ def __init__(self, nc=80, ch=(), inplace=True): # detection layer
268
+ super().__init__()
269
+ self.nc = nc # number of classes
270
+ self.nl = len(ch) // 3 # number of detection layers
271
+ self.reg_max = 16
272
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
273
+ self.inplace = inplace # use inplace ops (e.g. slice assignment)
274
+ self.stride = torch.zeros(self.nl) # strides computed during build
275
+
276
+ c2, c3 = max((ch[0] // 4, self.reg_max * 4, 16)), max((ch[0], min((self.nc * 2, 128)))) # channels
277
+ c4, c5 = max((ch[self.nl] // 4, self.reg_max * 4, 16)), max((ch[self.nl], min((self.nc * 2, 128)))) # channels
278
+ c6, c7 = max((ch[self.nl * 2] // 4, self.reg_max * 4, 16)), max((ch[self.nl * 2], min((self.nc * 2, 128)))) # channels
279
+ self.cv2 = nn.ModuleList(
280
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch[:self.nl])
281
+ self.cv3 = nn.ModuleList(
282
+ nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch[:self.nl])
283
+ self.cv4 = nn.ModuleList(
284
+ nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, 4 * self.reg_max, 1)) for x in ch[self.nl:self.nl*2])
285
+ self.cv5 = nn.ModuleList(
286
+ nn.Sequential(Conv(x, c5, 3), Conv(c5, c5, 3), nn.Conv2d(c5, self.nc, 1)) for x in ch[self.nl:self.nl*2])
287
+ self.cv6 = nn.ModuleList(
288
+ nn.Sequential(Conv(x, c6, 3), Conv(c6, c6, 3), nn.Conv2d(c6, 4 * self.reg_max, 1)) for x in ch[self.nl*2:self.nl*3])
289
+ self.cv7 = nn.ModuleList(
290
+ nn.Sequential(Conv(x, c7, 3), Conv(c7, c7, 3), nn.Conv2d(c7, self.nc, 1)) for x in ch[self.nl*2:self.nl*3])
291
+ self.dfl = DFL(self.reg_max)
292
+ self.dfl2 = DFL(self.reg_max)
293
+ self.dfl3 = DFL(self.reg_max)
294
+
295
+ def forward(self, x):
296
+ shape = x[0].shape # BCHW
297
+ d1 = []
298
+ d2 = []
299
+ d3 = []
300
+ for i in range(self.nl):
301
+ d1.append(torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1))
302
+ d2.append(torch.cat((self.cv4[i](x[self.nl+i]), self.cv5[i](x[self.nl+i])), 1))
303
+ d3.append(torch.cat((self.cv6[i](x[self.nl*2+i]), self.cv7[i](x[self.nl*2+i])), 1))
304
+ if self.training:
305
+ return [d1, d2, d3]
306
+ elif self.dynamic or self.shape != shape:
307
+ self.anchors, self.strides = (d1.transpose(0, 1) for d1 in make_anchors(d1, self.stride, 0.5))
308
+ self.shape = shape
309
+
310
+ box, cls = torch.cat([di.view(shape[0], self.no, -1) for di in d1], 2).split((self.reg_max * 4, self.nc), 1)
311
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
312
+ box2, cls2 = torch.cat([di.view(shape[0], self.no, -1) for di in d2], 2).split((self.reg_max * 4, self.nc), 1)
313
+ dbox2 = dist2bbox(self.dfl2(box2), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
314
+ box3, cls3 = torch.cat([di.view(shape[0], self.no, -1) for di in d3], 2).split((self.reg_max * 4, self.nc), 1)
315
+ dbox3 = dist2bbox(self.dfl3(box3), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
316
+ y = [torch.cat((dbox, cls.sigmoid()), 1), torch.cat((dbox2, cls2.sigmoid()), 1), torch.cat((dbox3, cls3.sigmoid()), 1)]
317
+ return y if self.export else (y, [d1, d2, d3])
318
+
319
+ def bias_init(self):
320
+ # Initialize Detect() biases, WARNING: requires stride availability
321
+ m = self # self.model[-1] # Detect() module
322
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
323
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
324
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
325
+ a[-1].bias.data[:] = 1.0 # box
326
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
327
+ for a, b, s in zip(m.cv4, m.cv5, m.stride): # from
328
+ a[-1].bias.data[:] = 1.0 # box
329
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
330
+ for a, b, s in zip(m.cv6, m.cv7, m.stride): # from
331
+ a[-1].bias.data[:] = 1.0 # box
332
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
333
+
334
+
335
+ class TripleDDetect(nn.Module):
336
+ # YOLO Detect head for detection models
337
+ dynamic = False # force grid reconstruction
338
+ export = False # export mode
339
+ shape = None
340
+ anchors = torch.empty(0) # init
341
+ strides = torch.empty(0) # init
342
+
343
+ def __init__(self, nc=80, ch=(), inplace=True): # detection layer
344
+ super().__init__()
345
+ self.nc = nc # number of classes
346
+ self.nl = len(ch) // 3 # number of detection layers
347
+ self.reg_max = 16
348
+ self.no = nc + self.reg_max * 4 # number of outputs per anchor
349
+ self.inplace = inplace # use inplace ops (e.g. slice assignment)
350
+ self.stride = torch.zeros(self.nl) # strides computed during build
351
+
352
+ c2, c3 = make_divisible(max((ch[0] // 4, self.reg_max * 4, 16)), 4), \
353
+ max((ch[0], min((self.nc * 2, 128)))) # channels
354
+ c4, c5 = make_divisible(max((ch[self.nl] // 4, self.reg_max * 4, 16)), 4), \
355
+ max((ch[self.nl], min((self.nc * 2, 128)))) # channels
356
+ c6, c7 = make_divisible(max((ch[self.nl * 2] // 4, self.reg_max * 4, 16)), 4), \
357
+ max((ch[self.nl * 2], min((self.nc * 2, 128)))) # channels
358
+ self.cv2 = nn.ModuleList(
359
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3, g=4),
360
+ nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4)) for x in ch[:self.nl])
361
+ self.cv3 = nn.ModuleList(
362
+ nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch[:self.nl])
363
+ self.cv4 = nn.ModuleList(
364
+ nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3, g=4),
365
+ nn.Conv2d(c4, 4 * self.reg_max, 1, groups=4)) for x in ch[self.nl:self.nl*2])
366
+ self.cv5 = nn.ModuleList(
367
+ nn.Sequential(Conv(x, c5, 3), Conv(c5, c5, 3), nn.Conv2d(c5, self.nc, 1)) for x in ch[self.nl:self.nl*2])
368
+ self.cv6 = nn.ModuleList(
369
+ nn.Sequential(Conv(x, c6, 3), Conv(c6, c6, 3, g=4),
370
+ nn.Conv2d(c6, 4 * self.reg_max, 1, groups=4)) for x in ch[self.nl*2:self.nl*3])
371
+ self.cv7 = nn.ModuleList(
372
+ nn.Sequential(Conv(x, c7, 3), Conv(c7, c7, 3), nn.Conv2d(c7, self.nc, 1)) for x in ch[self.nl*2:self.nl*3])
373
+ self.dfl = DFL(self.reg_max)
374
+ self.dfl2 = DFL(self.reg_max)
375
+ self.dfl3 = DFL(self.reg_max)
376
+
377
+ def forward(self, x):
378
+ shape = x[0].shape # BCHW
379
+ d1 = []
380
+ d2 = []
381
+ d3 = []
382
+ for i in range(self.nl):
383
+ d1.append(torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1))
384
+ d2.append(torch.cat((self.cv4[i](x[self.nl+i]), self.cv5[i](x[self.nl+i])), 1))
385
+ d3.append(torch.cat((self.cv6[i](x[self.nl*2+i]), self.cv7[i](x[self.nl*2+i])), 1))
386
+ if self.training:
387
+ return [d1, d2, d3]
388
+ elif self.dynamic or self.shape != shape:
389
+ self.anchors, self.strides = (d1.transpose(0, 1) for d1 in make_anchors(d1, self.stride, 0.5))
390
+ self.shape = shape
391
+
392
+ box, cls = torch.cat([di.view(shape[0], self.no, -1) for di in d1], 2).split((self.reg_max * 4, self.nc), 1)
393
+ dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
394
+ box2, cls2 = torch.cat([di.view(shape[0], self.no, -1) for di in d2], 2).split((self.reg_max * 4, self.nc), 1)
395
+ dbox2 = dist2bbox(self.dfl2(box2), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
396
+ box3, cls3 = torch.cat([di.view(shape[0], self.no, -1) for di in d3], 2).split((self.reg_max * 4, self.nc), 1)
397
+ dbox3 = dist2bbox(self.dfl3(box3), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
398
+ #y = [torch.cat((dbox, cls.sigmoid()), 1), torch.cat((dbox2, cls2.sigmoid()), 1), torch.cat((dbox3, cls3.sigmoid()), 1)]
399
+ #return y if self.export else (y, [d1, d2, d3])
400
+ y = torch.cat((dbox3, cls3.sigmoid()), 1)
401
+ return y if self.export else (y, d3)
402
+
403
+ def bias_init(self):
404
+ # Initialize Detect() biases, WARNING: requires stride availability
405
+ m = self # self.model[-1] # Detect() module
406
+ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
407
+ # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
408
+ for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
409
+ a[-1].bias.data[:] = 1.0 # box
410
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
411
+ for a, b, s in zip(m.cv4, m.cv5, m.stride): # from
412
+ a[-1].bias.data[:] = 1.0 # box
413
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
414
+ for a, b, s in zip(m.cv6, m.cv7, m.stride): # from
415
+ a[-1].bias.data[:] = 1.0 # box
416
+ b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (5 objects and 80 classes per 640 image)
417
+
418
+
419
+ class Segment(Detect):
420
+ # YOLO Segment head for segmentation models
421
+ def __init__(self, nc=80, nm=32, npr=256, ch=(), inplace=True):
422
+ super().__init__(nc, ch, inplace)
423
+ self.nm = nm # number of masks
424
+ self.npr = npr # number of protos
425
+ self.proto = Proto(ch[0], self.npr, self.nm) # protos
426
+ self.detect = Detect.forward
427
+
428
+ c4 = max(ch[0] // 4, self.nm)
429
+ self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)
430
+
431
+ def forward(self, x):
432
+ p = self.proto(x[0])
433
+ bs = p.shape[0]
434
+
435
+ mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients
436
+ x = self.detect(self, x)
437
+ if self.training:
438
+ return x, mc, p
439
+ return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
440
+
441
+
442
+ class Panoptic(Detect):
443
+ # YOLO Panoptic head for panoptic segmentation models
444
+ def __init__(self, nc=80, sem_nc=93, nm=32, npr=256, ch=(), inplace=True):
445
+ super().__init__(nc, ch, inplace)
446
+ self.sem_nc = sem_nc
447
+ self.nm = nm # number of masks
448
+ self.npr = npr # number of protos
449
+ self.proto = Proto(ch[0], self.npr, self.nm) # protos
450
+ self.uconv = UConv(ch[0], ch[0]//4, self.sem_nc+self.nc)
451
+ self.detect = Detect.forward
452
+
453
+ c4 = max(ch[0] // 4, self.nm)
454
+ self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)
455
+
456
+
457
+ def forward(self, x):
458
+ p = self.proto(x[0])
459
+ s = self.uconv(x[0])
460
+ bs = p.shape[0]
461
+
462
+ mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients
463
+ x = self.detect(self, x)
464
+ if self.training:
465
+ return x, mc, p, s
466
+ return (torch.cat([x, mc], 1), p, s) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p, s))
467
+
468
+
469
+ class BaseModel(nn.Module):
470
+ # YOLO base model
471
+ def forward(self, x, profile=False, visualize=False):
472
+ return self._forward_once(x, profile, visualize) # single-scale inference, train
473
+
474
+ def _forward_once(self, x, profile=False, visualize=False):
475
+ y, dt = [], [] # outputs
476
+ for m in self.model:
477
+ if m.f != -1: # if not from previous layer
478
+ x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers
479
+ if profile:
480
+ self._profile_one_layer(m, x, dt)
481
+ #print(m)
482
+ x = m(x) # run
483
+ y.append(x if m.i in self.save else None) # save output
484
+ if visualize:
485
+ feature_visualization(x, m.type, m.i, save_dir=visualize)
486
+ return x
487
+
488
+ def _profile_one_layer(self, m, x, dt):
489
+ c = m == self.model[-1] # is final layer, copy input as inplace fix
490
+ o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1E9 * 2 if thop else 0 # FLOPs
491
+ t = time_sync()
492
+ for _ in range(10):
493
+ m(x.copy() if c else x)
494
+ dt.append((time_sync() - t) * 100)
495
+ if m == self.model[0]:
496
+ LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s} module")
497
+ LOGGER.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f} {m.type}')
498
+ if c:
499
+ LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total")
500
+
501
+ def fuse(self): # fuse model Conv2d() + BatchNorm2d() layers
502
+ LOGGER.info('Fusing layers... ')
503
+ for m in self.model.modules():
504
+ if isinstance(m, (RepConvN)) and hasattr(m, 'fuse_convs'):
505
+ m.fuse_convs()
506
+ m.forward = m.forward_fuse # update forward
507
+ if isinstance(m, (Conv, DWConv)) and hasattr(m, 'bn'):
508
+ m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
509
+ delattr(m, 'bn') # remove batchnorm
510
+ m.forward = m.forward_fuse # update forward
511
+ self.info()
512
+ return self
513
+
514
+ def info(self, verbose=False, img_size=640): # print model information
515
+ model_info(self, verbose, img_size)
516
+
517
+ def _apply(self, fn):
518
+ # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
519
+ self = super()._apply(fn)
520
+ m = self.model[-1] # Detect()
521
+ if isinstance(m, (Detect, DualDetect, TripleDetect, DDetect, DualDDetect, TripleDDetect, Segment, Panoptic)):
522
+ m.stride = fn(m.stride)
523
+ m.anchors = fn(m.anchors)
524
+ m.strides = fn(m.strides)
525
+ # m.grid = list(map(fn, m.grid))
526
+ return self
527
+
528
+
529
+ class DetectionModel(BaseModel):
530
+ # YOLO detection model
531
+ def __init__(self, cfg='yolo.yaml', ch=3, nc=None, anchors=None): # model, input channels, number of classes
532
+ super().__init__()
533
+ if isinstance(cfg, dict):
534
+ self.yaml = cfg # model dict
535
+ else: # is *.yaml
536
+ import yaml # for torch hub
537
+ self.yaml_file = Path(cfg).name
538
+ with open(cfg, encoding='ascii', errors='ignore') as f:
539
+ self.yaml = yaml.safe_load(f) # model dict
540
+
541
+ # Define model
542
+ ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels
543
+ if nc and nc != self.yaml['nc']:
544
+ LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
545
+ self.yaml['nc'] = nc # override yaml value
546
+ if anchors:
547
+ LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
548
+ self.yaml['anchors'] = round(anchors) # override yaml value
549
+ self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch]) # model, savelist
550
+ self.names = [str(i) for i in range(self.yaml['nc'])] # default names
551
+ self.inplace = self.yaml.get('inplace', True)
552
+
553
+ # Build strides, anchors
554
+ m = self.model[-1] # Detect()
555
+ if isinstance(m, (Detect, DDetect, Segment, Panoptic)):
556
+ s = 256 # 2x min stride
557
+ m.inplace = self.inplace
558
+ forward = lambda x: self.forward(x)[0] if isinstance(m, (Segment, Panoptic)) else self.forward(x)
559
+ m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))]) # forward
560
+ # check_anchor_order(m)
561
+ # m.anchors /= m.stride.view(-1, 1, 1)
562
+ self.stride = m.stride
563
+ m.bias_init() # only run once
564
+ if isinstance(m, (DualDetect, TripleDetect, DualDDetect, TripleDDetect)):
565
+ s = 256 # 2x min stride
566
+ m.inplace = self.inplace
567
+ #forward = lambda x: self.forward(x)[0][0] if isinstance(m, (DualSegment, DualPanoptic)) else self.forward(x)[0]
568
+ forward = lambda x: self.forward(x)[0]
569
+ m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))]) # forward
570
+ # check_anchor_order(m)
571
+ # m.anchors /= m.stride.view(-1, 1, 1)
572
+ self.stride = m.stride
573
+ m.bias_init() # only run once
574
+
575
+ # Init weights, biases
576
+ initialize_weights(self)
577
+ self.info()
578
+ LOGGER.info('')
579
+
580
+ def forward(self, x, augment=False, profile=False, visualize=False):
581
+ if augment:
582
+ return self._forward_augment(x) # augmented inference, None
583
+ return self._forward_once(x, profile, visualize) # single-scale inference, train
584
+
585
+ def _forward_augment(self, x):
586
+ img_size = x.shape[-2:] # height, width
587
+ s = [1, 0.83, 0.67] # scales
588
+ f = [None, 3, None] # flips (2-ud, 3-lr)
589
+ y = [] # outputs
590
+ for si, fi in zip(s, f):
591
+ xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
592
+ yi = self._forward_once(xi)[0] # forward
593
+ # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1]) # save
594
+ yi = self._descale_pred(yi, fi, si, img_size)
595
+ y.append(yi)
596
+ y = self._clip_augmented(y) # clip augmented tails
597
+ return torch.cat(y, 1), None # augmented inference, train
598
+
599
+ def _descale_pred(self, p, flips, scale, img_size):
600
+ # de-scale predictions following augmented inference (inverse operation)
601
+ if self.inplace:
602
+ p[..., :4] /= scale # de-scale
603
+ if flips == 2:
604
+ p[..., 1] = img_size[0] - p[..., 1] # de-flip ud
605
+ elif flips == 3:
606
+ p[..., 0] = img_size[1] - p[..., 0] # de-flip lr
607
+ else:
608
+ x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale # de-scale
609
+ if flips == 2:
610
+ y = img_size[0] - y # de-flip ud
611
+ elif flips == 3:
612
+ x = img_size[1] - x # de-flip lr
613
+ p = torch.cat((x, y, wh, p[..., 4:]), -1)
614
+ return p
615
+
616
+ def _clip_augmented(self, y):
617
+ # Clip YOLO augmented inference tails
618
+ nl = self.model[-1].nl # number of detection layers (P3-P5)
619
+ g = sum(4 ** x for x in range(nl)) # grid points
620
+ e = 1 # exclude layer count
621
+ i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e)) # indices
622
+ y[0] = y[0][:, :-i] # large
623
+ i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e)) # indices
624
+ y[-1] = y[-1][:, i:] # small
625
+ return y
626
+
627
+
628
+ Model = DetectionModel # retain YOLO 'Model' class for backwards compatibility
629
+
630
+
631
+ class SegmentationModel(DetectionModel):
632
+ # YOLO segmentation model
633
+ def __init__(self, cfg='yolo-seg.yaml', ch=3, nc=None, anchors=None):
634
+ super().__init__(cfg, ch, nc, anchors)
635
+
636
+
637
+ class ClassificationModel(BaseModel):
638
+ # YOLO classification model
639
+ def __init__(self, cfg=None, model=None, nc=1000, cutoff=10): # yaml, model, number of classes, cutoff index
640
+ super().__init__()
641
+ self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg)
642
+
643
+ def _from_detection_model(self, model, nc=1000, cutoff=10):
644
+ # Create a YOLO classification model from a YOLO detection model
645
+ if isinstance(model, DetectMultiBackend):
646
+ model = model.model # unwrap DetectMultiBackend
647
+ model.model = model.model[:cutoff] # backbone
648
+ m = model.model[-1] # last layer
649
+ ch = m.conv.in_channels if hasattr(m, 'conv') else m.cv1.conv.in_channels # ch into module
650
+ c = Classify(ch, nc) # Classify()
651
+ c.i, c.f, c.type = m.i, m.f, 'models.common.Classify' # index, from, type
652
+ model.model[-1] = c # replace
653
+ self.model = model.model
654
+ self.stride = model.stride
655
+ self.save = []
656
+ self.nc = nc
657
+
658
+ def _from_yaml(self, cfg):
659
+ # Create a YOLO classification model from a *.yaml file
660
+ self.model = None
661
+
662
+
663
+ def parse_model(d, ch): # model_dict, input_channels(3)
664
+ # Parse a YOLO model.yaml dictionary
665
+ LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
666
+ anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
667
+ if act:
668
+ Conv.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = nn.SiLU()
669
+ RepConvN.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = nn.SiLU()
670
+ LOGGER.info(f"{colorstr('activation:')} {act}") # print
671
+ na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors
672
+ no = na * (nc + 5) # number of outputs = anchors * (classes + 5)
673
+
674
+ layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
675
+ for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
676
+ m = eval(m) if isinstance(m, str) else m # eval strings
677
+ for j, a in enumerate(args):
678
+ with contextlib.suppress(NameError):
679
+ args[j] = eval(a) if isinstance(a, str) else a # eval strings
680
+
681
+ n = n_ = max(round(n * gd), 1) if n > 1 else n # depth gain
682
+ if m in {
683
+ Conv, AConv, ConvTranspose,
684
+ Bottleneck, SPP, SPPF, DWConv, BottleneckCSP, nn.ConvTranspose2d, DWConvTranspose2d, SPPCSPC, ADown,
685
+ RepNCSPELAN4, SPPELAN}:
686
+ c1, c2 = ch[f], args[0]
687
+ if c2 != no: # if not output
688
+ c2 = make_divisible(c2 * gw, 8)
689
+
690
+ args = [c1, c2, *args[1:]]
691
+ if m in {BottleneckCSP, SPPCSPC}:
692
+ args.insert(2, n) # number of repeats
693
+ n = 1
694
+ elif m is nn.BatchNorm2d:
695
+ args = [ch[f]]
696
+ elif m in [Down0, Down1, Down2, Down3, Down4]:
697
+ c2 = args[0]
698
+
699
+ elif m is Concat:
700
+ c2 = sum(ch[x] for x in f)
701
+ elif m is Shortcut:
702
+ c2 = ch[f[0]]
703
+ elif m is ReOrg:
704
+ c2 = ch[f] * 4
705
+ elif m is CBLinear:
706
+ c2 = args[0]
707
+ c1 = ch[f]
708
+ args = [c1, c2, *args[1:]]
709
+ elif m is CBFuse:
710
+ c2 = ch[f[-1]]
711
+ # TODO: channel, gw, gd
712
+ elif m in {Detect, DualDetect, TripleDetect, DDetect, DualDDetect, TripleDDetect, Segment, Panoptic}:
713
+ args.append([ch[x] for x in f])
714
+ # if isinstance(args[1], int): # number of anchors
715
+ # args[1] = [list(range(args[1] * 2))] * len(f)
716
+ if m in {Segment, Panoptic}:
717
+ args[2] = make_divisible(args[2] * gw, 8)
718
+ elif m is Contract:
719
+ c2 = ch[f] * args[0] ** 2
720
+ elif m is Expand:
721
+ c2 = ch[f] // args[0] ** 2
722
+ else:
723
+ c2 = ch[f]
724
+
725
+ m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module
726
+ t = str(m)[8:-2].replace('__main__.', '') # module type
727
+ np = sum(x.numel() for x in m_.parameters()) # number params
728
+ m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params
729
+ LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f} {t:<40}{str(args):<30}') # print
730
+ save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
731
+ layers.append(m_)
732
+ if i == 0:
733
+ ch = []
734
+ ch.append(c2)
735
+ return nn.Sequential(*layers), sorted(save)
736
+
737
+
738
+ if __name__ == '__main__':
739
+ parser = argparse.ArgumentParser()
740
+ parser.add_argument('--cfg', type=str, default='yolo.yaml', help='model.yaml')
741
+ parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs')
742
+ parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
743
+ parser.add_argument('--profile', action='store_true', help='profile model speed')
744
+ parser.add_argument('--line-profile', action='store_true', help='profile model speed layer by layer')
745
+ parser.add_argument('--test', action='store_true', help='test all yolo*.yaml')
746
+ opt = parser.parse_args()
747
+ opt.cfg = check_yaml(opt.cfg) # check YAML
748
+ print_args(vars(opt))
749
+ device = select_device(opt.device)
750
+
751
+ # Create model
752
+ im = torch.rand(opt.batch_size, 3, 640, 640).to(device)
753
+ model = Model(opt.cfg).to(device)
754
+ model.eval()
755
+
756
+ # Options
757
+ if opt.line_profile: # profile layer by layer
758
+ model(im, profile=True)
759
+
760
+ elif opt.profile: # profile forward-backward
761
+ results = profile(input=im, ops=[model], n=3)
762
+
763
+ elif opt.test: # test all models
764
+ for cfg in Path(ROOT / 'models').rglob('yolo*.yaml'):
765
+ try:
766
+ _ = Model(cfg)
767
+ except Exception as e:
768
+ print(f'Error in {cfg}: {e}')
769
+
770
+ else: # report fused model summary
771
+ model.fuse()
spark repvit/repvit_1kpretrained_timm_style.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89abdcdc1a2865f96822bb61e3096159087c6f5d331961dd1fed8e0a9c58988e
3
+ size 269763237
spark/downstream_d2/README.md ADDED
@@ -0,0 +1,101 @@
1
+ ## About code isolation
2
+
3
+ This `downstream_d2` directory is isolated from the pre-training code, so it can be treated as an independent codebase 🛠️.
4
+
5
+
6
+ ## Fine-tuned ResNet-50 weights, log files, and performance
7
+
8
+ <div align="center">
9
+
10
+ [[`weights (pre-trained by SparK)`](https://drive.google.com/file/d/1H8605HbxGvrsu4x4rIoNr-Wkd7JkxFPQ/view?usp=share_link)]
11
+ [[`weights (fine-tuned on COCO)`](https://drive.google.com/file/d/1Ue7SiQ1E_AwgtYo56Fm-iUlQPZ8vIwYj/view?usp=share_link)]
12
+ [[`metrics.json`](https://drive.google.com/file/d/1wfbUWh4svV8sPWya_0PAhsLHVayDQRCi/view?usp=share_link)]
13
+ [[`log.txt`](https://drive.google.com/file/d/11zVo_87pe9DMAmfNQK9FUfyjQWHTRKxV/view?usp=share_link)]
14
+ [[`tensorboard file`](https://drive.google.com/file/d/1aM1qj8c3-Uka1dZuYmKhgp1lNJpeMDMl/view?usp=share_link)]
15
+ </div>
16
+
17
+ <p align="center">
18
+ <img src="https://user-images.githubusercontent.com/39692511/211497479-0563e891-f2ad-4cf1-b682-a21c2be1442d.png" width=80%>
19
+ </p>
20
+
21
+
22
+ ## Install [Detectron2 v0.6](https://github.com/facebookresearch/detectron2/releases/tag/v0.6) before fine-tuning ResNet on COCO
23
+
24
+
25
+ 1. Set up a Python environment, e.g.:
26
+ ```shell script
27
+ $ conda create -n spark python=3.8 -y
28
+ $ conda activate spark
29
+ ```
30
+
31
+ 2. Install `detectron2==0.6` (e.g., with `torch==1.10.0` and `cuda11.3`):
32
+ ```shell script
33
+ $ pip install detectron2==0.6 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
34
+ ```
35
+
36
+ You can also find instructions for other PyTorch/CUDA versions on [this page](https://github.com/facebookresearch/detectron2/releases/tag/v0.6).
37
+
38
+
39
+ 3. Put the COCO dataset folder at `downstream_d2/datasets/coco`.
40
+ The folder should follow the [directory structure](https://github.com/facebookresearch/detectron2/tree/master/datasets) required by `Detectron2`, which should look like this:
41
+ ```
42
+ downstream_d2/datasets/coco:
43
+ annotations/:
44
+ captions_train2017.json captions_val2017.json
45
+ instances_train2017.json instances_val2017.json
46
+ person_keypoints_train2017.json person_keypoints_val2017.json
47
+ train2017/:
48
+ a_lot_images.jpg
49
+ val2017/:
50
+ a_lot_images.jpg
51
+ ```
52
+
53
+
54
+ ## Training from pre-trained checkpoint
55
+
56
+ The script file for COCO fine-tuning (object detection and instance segmentation) is [downstream_d2/train_net.py](https://github.com/keyu-tian/SparK/blob/main/downstream_d2/train_net.py),
57
+ which is a modification of [Detectron2's tools/train_net.py](https://github.com/facebookresearch/detectron2/blob/v0.6/tools/train_net.py).
58
+
59
+
60
+ Before fine-tuning a ResNet50 pre-trained by SparK, you should first convert our checkpoint file to a Detectron2-style `.pkl` file:
61
+
62
+ ```shell script
63
+ $ cd /path/to/SparK/downstream_d2
64
+ $ python3 convert-timm-to-d2.py /some/path/to/resnet50_1kpretrained_timm_style.pth d2-style.pkl
65
+ ```
66
+
67
+ For a ResNet50, you should see a log reporting `len(state)==318`:
68
+ ```text
69
+ [convert] .pkl is generated! (from `/some/path/to/resnet50_1kpretrained_timm_style.pth`, to `d2-style.pkl`, len(state)==318)
70
+ ```
71
+
72
+ Then run fine-tuning on a single machine with 8 GPUs:
73
+
74
+ ```shell script
75
+ $ cd /path/to/SparK/downstream_d2
76
+ $ python3 ./train_net.py --resume --num-gpus 8 --config-file ./configs/coco_R_50_FPN_CONV_1x_moco_adam.yaml \
77
+ MODEL.WEIGHTS d2-style.pkl \
78
+ OUTPUT_DIR <your_output_dir>
79
+ ```
80
+
81
+ For multiple machines, add these args:
82
+ ```shell script
83
+ --num-machines <total_num> --machine-rank <this_rank> --dist-url <url:port>
84
+ ```
85
+
86
+ In `<your_output_dir>` you'll see the log files generated by `Detectron2`.
87
+
88
+
89
+ ## Details: how we modify the official Detectron2's [tools/train_net.py](https://github.com/facebookresearch/detectron2/blob/v0.6/tools/train_net.py) to get our [downstream_d2/train_net.py](https://github.com/keyu-tian/SparK/blob/main/downstream_d2/train_net.py)
90
+
91
+ 1. We add two new hyperparameters (a YAML sketch showing them is at the end of this section):
92
+ - str `SOLVER.OPTIMIZER`: use 'ADAM' (the same as 'ADAMW') or 'SGD' optimizer
93
+ - float `SOLVER.LR_DECAY`: the decay ratio (from 0. to 1.) of layer-wise learning rate decay trick
94
+
95
+ 2. We implement layer-wise lr decay in [downstream_d2/lr_decay.py](https://github.com/keyu-tian/SparK/blob/main/downstream_d2/lr_decay.py).
96
+
97
+ 3. We write a script to convert our timm-style pre-trained ResNet weights to Detectron2-style in [downstream_d2/convert-timm-to-d2.py](https://github.com/keyu-tian/SparK/blob/main/downstream_d2/convert-timm-to-d2.py).
98
+
99
+ 4. We also add a hook for logging results to `cfg.OUTPUT_DIR/d2_coco_log.txt`.
100
+
101
+ All of our modifications to the original are commented with `# [modification] ...` in [downstream_d2/train_net.py](https://github.com/keyu-tian/SparK/blob/main/downstream_d2/train_net.py) or other files.
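+
+ As a quick reference, the two new keys live under `SOLVER`; the values below are copied from `configs/coco_R_50_FPN_CONV_1x_moco_adam.yaml` in this folder:
+
+ ```yaml
+ SOLVER:
+   OPTIMIZER: "ADAMW"  # or "SGD"
+   LR_DECAY: 0.6       # layer-wise lr decay ratio; 0. or 1. turns the trick off
+ ```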
spark/downstream_d2/configs/Base-RCNN-FPN.yaml ADDED
@@ -0,0 +1,42 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedRCNN"
3
+ BACKBONE:
4
+ NAME: "build_resnet_fpn_backbone"
5
+ RESNETS:
6
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
7
+ FPN:
8
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
9
+ ANCHOR_GENERATOR:
10
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
11
+ ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
12
+ RPN:
13
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
14
+ PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
15
+ PRE_NMS_TOPK_TEST: 1000 # Per FPN level
16
+ # Detectron1 uses 2000 proposals per-batch,
17
+ # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
18
+ # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
19
+ POST_NMS_TOPK_TRAIN: 1000
20
+ POST_NMS_TOPK_TEST: 1000
21
+ ROI_HEADS:
22
+ NAME: "StandardROIHeads"
23
+ IN_FEATURES: ["p2", "p3", "p4", "p5"]
24
+ ROI_BOX_HEAD:
25
+ NAME: "FastRCNNConvFCHead"
26
+ NUM_FC: 2
27
+ POOLER_RESOLUTION: 7
28
+ ROI_MASK_HEAD:
29
+ NAME: "MaskRCNNConvUpsampleHead"
30
+ NUM_CONV: 4
31
+ POOLER_RESOLUTION: 14
32
+ DATASETS:
33
+ TRAIN: ("coco_2017_train",)
34
+ TEST: ("coco_2017_val",)
35
+ SOLVER:
36
+ IMS_PER_BATCH: 16
37
+ BASE_LR: 0.02
38
+ STEPS: (60000, 80000)
39
+ MAX_ITER: 90000
40
+ INPUT:
41
+ MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
42
+ VERSION: 2
spark/downstream_d2/configs/coco_R_50_FPN_CONV_1x_moco_adam.yaml ADDED
@@ -0,0 +1,57 @@
1
+ _BASE_: "Base-RCNN-FPN.yaml"
2
+ MODEL:
3
+ WEIGHTS: "<see instructions>"
4
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
5
+ PIXEL_STD: [58.395, 57.120, 57.375]
6
+
7
+ MASK_ON: True
8
+ BACKBONE:
9
+ FREEZE_AT: 0
10
+ RESNETS:
11
+ DEPTH: 50
12
+ NORM: "SyncBN"
13
+ STRIDE_IN_1X1: False
14
+ FPN:
15
+ NORM: "SyncBN"
16
+ ROI_BOX_HEAD:
17
+ NAME: "FastRCNNConvFCHead"
18
+ NUM_FC: 1
19
+ NUM_CONV: 4
20
+ POOLER_RESOLUTION: 7
21
+ NORM: "SyncBN"
22
+ ROI_MASK_HEAD:
23
+ NAME: "MaskRCNNConvUpsampleHead"
24
+ NUM_CONV: 4
25
+ POOLER_RESOLUTION: 14
26
+ NORM: "SyncBN"
27
+
28
+ INPUT:
29
+ MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896)
30
+ CROP:
31
+ ENABLED: False
32
+ TYPE: "absolute_range"
33
+ SIZE: (384, 600)
34
+ FORMAT: "RGB"
35
+ TEST:
36
+ EVAL_PERIOD: 5000
37
+ PRECISE_BN:
38
+ ENABLED: True
39
+
40
+ SOLVER:
41
+ STEPS: (60000, 80000)
42
+ MAX_ITER: 90000
43
+ GAMMA: 0.25
44
+ BASE_LR: 0.00025
45
+ WARMUP_FACTOR: 0.01
46
+ WARMUP_ITERS: 1000
47
+ WEIGHT_DECAY: 0.0001
48
+ CHECKPOINT_PERIOD: 5000
49
+ CLIP_GRADIENTS:
50
+ ENABLED: False
51
+ CLIP_TYPE: "value"
52
+ CLIP_VALUE: 1.0
53
+ NORM_TYPE: 2.0
54
+
55
+ # compared to standard detectron2, we add these two new configurations:
56
+ OPTIMIZER: "ADAMW"
57
+ LR_DECAY: 0.6
spark/downstream_d2/convert-timm-to-d2.py ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/python3
2
+
3
+ # Copyright (c) ByteDance, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ import pickle as pkl
10
+
11
+ import torch
12
+
13
+
14
+ # we use `timm.models.ResNet` in pre-training, so keys are timm-style
15
+ def timm_resnet_to_detectron2_resnet(source_file, target_file):
16
+ pretrained: dict = torch.load(source_file, map_location='cpu')
17
+ for mod_k in {'state_dict', 'state', 'module', 'model'}:
18
+ if mod_k in pretrained:
19
+ pretrained = pretrained[mod_k]
20
+ if any(k.startswith('module.encoder_q.') for k in pretrained.keys()):
21
+ pretrained = {k.replace('module.encoder_q.', ''): v for k, v in pretrained.items() if k.startswith('module.encoder_q.')}
22
+
23
+ pkl_state = {}
24
+ for k, v in pretrained.items(): # convert resnet's keys from timm-style to d2-style
25
+ if 'layer' not in k:
26
+ k = 'stem.' + k
27
+ for t in [1, 2, 3, 4]:
28
+ k = k.replace(f'layer{t}', f'res{t+1}')
29
+ for t in [1, 2, 3]:
30
+ k = k.replace(f'bn{t}', f'conv{t}.norm')
31
+ k = k.replace('downsample.0', 'shortcut')
32
+ k = k.replace('downsample.1', 'shortcut.norm')
33
+
34
+ pkl_state[k] = v.detach().numpy()
35
+
36
+ with open(target_file, 'wb') as fp:
37
+ print(f'[convert] .pkl is generated! (from `{source_file}`, to `{target_file}`, len(state)=={len(pkl_state)})')
38
+ pkl.dump({'model': pkl_state, '__author__': 'https://github.com/keyu-tian/SparK', 'matching_heuristics': True}, fp)
39
+
40
+
41
+ if __name__ == '__main__':
42
+ import sys
43
+ timm_resnet_to_detectron2_resnet(sys.argv[1], sys.argv[2])
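
For reference, a minimal sketch (not part of the repository) of what the key renaming in `timm_resnet_to_detectron2_resnet` produces; the sample keys below are typical `timm.models.ResNet` parameter names chosen purely for illustration:

```python
# Standalone illustration of the timm-style -> detectron2-style key renaming above.
samples = ['conv1.weight', 'bn1.weight', 'layer1.0.conv2.weight', 'layer4.2.downsample.1.running_mean']
for k in samples:
    if 'layer' not in k:
        k = 'stem.' + k
    for t in [1, 2, 3, 4]:
        k = k.replace(f'layer{t}', f'res{t + 1}')
    for t in [1, 2, 3]:
        k = k.replace(f'bn{t}', f'conv{t}.norm')
    k = k.replace('downsample.0', 'shortcut').replace('downsample.1', 'shortcut.norm')
    print(k)
# -> stem.conv1.weight, stem.conv1.norm.weight, res2.0.conv2.weight, res5.2.shortcut.norm.running_mean
```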
spark/downstream_d2/lr_decay.py ADDED
@@ -0,0 +1,132 @@
1
+ from typing import List, Dict, Set, Optional, Callable, Any
2
+ import torch
3
+ import copy
4
+
5
+ from detectron2.solver.build import reduce_param_groups
6
+
7
+
8
+ def lr_factor_func(para_name: str, is_resnet50, dec: float, debug=False) -> float:
9
+ if dec == 0:
10
+ dec = 1.
11
+
12
+ N = 5 if is_resnet50 else 11
13
+ if '.stem.' in para_name:
14
+ layer_id = 0
15
+ elif '.res' in para_name:
16
+ ls = para_name.split('.res')[1].split('.')
17
+ if ls[0].isnumeric() and ls[1].isnumeric():
18
+ stage_id, block_id = int(ls[0]), int(ls[1])
19
+ if stage_id == 2: # res2
20
+ layer_id = 1
21
+ elif stage_id == 3: # res3
22
+ layer_id = 2
23
+ elif stage_id == 4: # res4
24
+ layer_id = 3 + block_id // 3 # 3, 4 or 4, 5
25
+ else: # res5
26
+ layer_id = N
27
+ else:
28
+ assert para_name.startswith('roi_heads.res5.norm.')
29
+ layer_id = N + 1 # roi_heads.res5.norm.weight and roi_heads.res5.norm.bias of C4
30
+ else:
31
+ layer_id = N + 1
32
+
33
+ exp = N + 1 - layer_id
34
+ return f'{dec:g} ** {exp}' if debug else dec ** exp
35
+
36
+
37
+ # [modification] see: https://github.com/facebookresearch/detectron2/blob/v0.6/detectron2/solver/build.py#L134
38
+ # add the `lr_factor_func` to implement lr decay
39
+ def get_default_optimizer_params(
40
+ model: torch.nn.Module,
41
+ base_lr: Optional[float] = None,
42
+ weight_decay: Optional[float] = None,
43
+ weight_decay_norm: Optional[float] = None,
44
+ bias_lr_factor: Optional[float] = 1.0,
45
+ weight_decay_bias: Optional[float] = None,
46
+ lr_factor_func: Optional[Callable] = None,
47
+ overrides: Optional[Dict[str, Dict[str, float]]] = None,
48
+ ) -> List[Dict[str, Any]]:
49
+ """
50
+ Get default param list for optimizer, with support for a few types of
51
+ overrides. If no overrides needed, this is equivalent to `model.parameters()`.
52
+
53
+ Args:
54
+ base_lr: lr for every group by default. Can be omitted to use the one in optimizer.
55
+ weight_decay: weight decay for every group by default. Can be omitted to use the one
56
+ in optimizer.
57
+ weight_decay_norm: override weight decay for params in normalization layers
58
+ bias_lr_factor: multiplier of lr for bias parameters.
59
+ weight_decay_bias: override weight decay for bias parameters.
60
+ lr_factor_func: function to calculate lr decay rate by mapping the parameter names to
61
+ corresponding lr decay rate. Note that setting this option requires
62
+ also setting ``base_lr``.
63
+ overrides: if not `None`, provides values for optimizer hyperparameters
64
+ (LR, weight decay) for module parameters with a given name; e.g.
65
+ ``{"embedding": {"lr": 0.01, "weight_decay": 0.1}}`` will set the LR and
66
+ weight decay values for all module parameters named `embedding`.
67
+
68
+ For common detection models, ``weight_decay_norm`` is the only option
69
+ needed to be set. ``bias_lr_factor,weight_decay_bias`` are legacy settings
70
+ from Detectron1 that are not found useful.
71
+
72
+ Example:
73
+ ::
74
+ torch.optim.SGD(get_default_optimizer_params(model, weight_decay_norm=0),
75
+ lr=0.01, weight_decay=1e-4, momentum=0.9)
76
+ """
77
+ if overrides is None:
78
+ overrides = {}
79
+ defaults = {}
80
+ if base_lr is not None:
81
+ defaults["lr"] = base_lr
82
+ if weight_decay is not None:
83
+ defaults["weight_decay"] = weight_decay
84
+ bias_overrides = {}
85
+ if bias_lr_factor is not None and bias_lr_factor != 1.0:
86
+ # NOTE: unlike Detectron v1, we now by default make bias hyperparameters
87
+ # exactly the same as regular weights.
88
+ if base_lr is None:
89
+ raise ValueError("bias_lr_factor requires base_lr")
90
+ bias_overrides["lr"] = base_lr * bias_lr_factor
91
+ if weight_decay_bias is not None:
92
+ bias_overrides["weight_decay"] = weight_decay_bias
93
+ if len(bias_overrides):
94
+ if "bias" in overrides:
95
+ raise ValueError("Conflicting overrides for 'bias'")
96
+ overrides["bias"] = bias_overrides
97
+ if lr_factor_func is not None:
98
+ if base_lr is None:
99
+ raise ValueError("lr_factor_func requires base_lr")
100
+ norm_module_types = (
101
+ torch.nn.BatchNorm1d,
102
+ torch.nn.BatchNorm2d,
103
+ torch.nn.BatchNorm3d,
104
+ torch.nn.SyncBatchNorm,
105
+ # NaiveSyncBatchNorm inherits from BatchNorm2d
106
+ torch.nn.GroupNorm,
107
+ torch.nn.InstanceNorm1d,
108
+ torch.nn.InstanceNorm2d,
109
+ torch.nn.InstanceNorm3d,
110
+ torch.nn.LayerNorm,
111
+ torch.nn.LocalResponseNorm,
112
+ )
113
+ params: List[Dict[str, Any]] = []
114
+ memo: Set[torch.nn.parameter.Parameter] = set()
115
+ for module_name, module in model.named_modules():
116
+ for module_param_name, value in module.named_parameters(recurse=False):
117
+ if not value.requires_grad:
118
+ continue
119
+ # Avoid duplicating parameters
120
+ if value in memo:
121
+ continue
122
+ memo.add(value)
123
+
124
+ hyperparams = copy.copy(defaults)
125
+ if isinstance(module, norm_module_types) and weight_decay_norm is not None:
126
+ hyperparams["weight_decay"] = weight_decay_norm
127
+ if lr_factor_func is not None:
128
+ hyperparams["lr"] *= lr_factor_func(f"{module_name}.{module_param_name}")
129
+
130
+ hyperparams.update(overrides.get(module_param_name, {}))
131
+ params.append({"params": [value], **hyperparams})
132
+ return reduce_param_groups(params)
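
A minimal sketch (not part of the repository) of the layer-wise factors that `lr_factor_func` assigns for a ResNet-50 backbone with `LR_DECAY: 0.6`; the parameter names below are typical detectron2 FPN names used only for illustration, and the import assumes the script is run from `spark/downstream_d2/`:

```python
from lr_decay import lr_factor_func

# factor == dec ** (N + 1 - layer_id) with N == 5 for ResNet-50, so shallower layers get a smaller lr
for name in ('backbone.bottom_up.stem.conv1.weight',    # stem        -> 0.6 ** 6
             'backbone.bottom_up.res2.0.conv1.weight',  # res2        -> 0.6 ** 5
             'backbone.bottom_up.res5.2.conv3.weight',  # res5        -> 0.6 ** 1
             'roi_heads.box_head.fc1.weight'):          # heads/necks -> 0.6 ** 0
    print(name, lr_factor_func(name, is_resnet50=True, dec=0.6))
```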
spark/downstream_d2/train_net.py ADDED
@@ -0,0 +1,322 @@
1
+ #!/usr/bin/python3
2
+
3
+ # Copyright (c) ByteDance, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ import datetime
10
+ import json
11
+ import logging
12
+ import os
13
+ import time
14
+ from collections import OrderedDict, defaultdict
15
+ from functools import partial
16
+ from pprint import pformat
17
+
18
+ import numpy as np
19
+ import torch
20
+ import detectron2.utils.comm as comm
21
+ from detectron2.checkpoint import DetectionCheckpointer
22
+ from detectron2.config import get_cfg
23
+ from detectron2.data import MetadataCatalog
24
+ from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch, PeriodicWriter
25
+ from detectron2.evaluation import (
26
+ CityscapesInstanceEvaluator,
27
+ CityscapesSemSegEvaluator,
28
+ COCOEvaluator,
29
+ COCOPanopticEvaluator,
30
+ DatasetEvaluators,
31
+ LVISEvaluator,
32
+ PascalVOCDetectionEvaluator,
33
+ SemSegEvaluator,
34
+ verify_results,
35
+ )
36
+ from detectron2.layers import get_norm
37
+ from detectron2.modeling import GeneralizedRCNNWithTTA
38
+ from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads
39
+ from detectron2.solver.build import maybe_add_gradient_clipping
40
+ from detectron2.utils.events import EventWriter
41
+
42
+ from lr_decay import get_default_optimizer_params, lr_factor_func
43
+
44
+
45
+ # [modification] for better logging
46
+ def _ex_repr(self):
47
+ d = vars(self)
48
+ ex = ', '.join(f'{k}={v}' for k, v in d.items() if not k.startswith('__') and k not in [
49
+ 'trainer', 'before_train', 'after_train', 'before_step', 'after_step', 'state_dict',
50
+ '_model', '_data_loader', 'logger',
51
+ ])
52
+ return f'{type(self).__name__}({ex})'
53
+ hooks.HookBase.__repr__ = _ex_repr
54
+ EventWriter.__repr__ = _ex_repr
55
+
56
+
57
+ # [modification] add norm
58
+ @ROI_HEADS_REGISTRY.register()
59
+ class Res5ROIHeadsExtraNorm(Res5ROIHeads):
60
+ """
61
+ As described in the MOCO paper, there is an extra BN layer
62
+ following the res5 stage.
63
+ """
64
+
65
+ def _build_res5_block(self, cfg):
66
+ seq, out_channels = super()._build_res5_block(cfg)
67
+ norm = cfg.MODEL.RESNETS.NORM
68
+ norm = get_norm(norm, out_channels)
69
+ seq.add_module("norm", norm)
70
+ return seq, out_channels
71
+
72
+
73
+ class Trainer(DefaultTrainer):
74
+ """
75
+ We use the "DefaultTrainer" which contains pre-defined default logic for
76
+ standard training workflow. They may not work for you, especially if you
77
+ are working on a new research project. In that case you can write your
78
+ own training loop. You can use "tools/plain_train_net.py" as an example.
79
+ """
80
+
81
+ # [modification] override the `build_optimizer` for using Adam and layer-wise lr decay
82
+ lr_decay_ratio: float = 1.0
83
+ @classmethod
84
+ def build_optimizer(cls, cfg, model):
85
+ is_resnet50 = int(cfg.MODEL.RESNETS.DEPTH) == 50
86
+ if comm.is_main_process():
87
+ dbg = defaultdict(list)
88
+ for module_name, module in model.named_modules():
89
+ for module_param_name, value in module.named_parameters(recurse=False):
90
+ if not value.requires_grad:
91
+ continue
92
+ lrf = lr_factor_func(f"{module_name}.{module_param_name}", is_resnet50=is_resnet50, dec=cls.lr_decay_ratio, debug=True)
93
+ dbg[lrf].append(f"{module_name}.{module_param_name}")
94
+ for k in sorted(dbg.keys()):
95
+ print(f'[{k}] {sorted(dbg[k])}')
96
+ print()
97
+
98
+ params = get_default_optimizer_params(
99
+ model,
100
+ base_lr=cfg.SOLVER.BASE_LR,
101
+ weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
102
+ bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
103
+ weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
104
+ lr_factor_func=partial(lr_factor_func, is_resnet50=is_resnet50, dec=cls.lr_decay_ratio, debug=False)
105
+ )
106
+
107
+ opt_clz = {
108
+ 'sgd': partial(torch.optim.SGD, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV),
109
+ 'adamw': torch.optim.AdamW,
110
+ 'adam': torch.optim.AdamW,
111
+ }[cfg.SOLVER.OPTIMIZER.lower()]
112
+ return maybe_add_gradient_clipping(cfg, opt_clz)(params, lr=cfg.SOLVER.BASE_LR, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
113
+
114
+ @classmethod
115
+ def build_evaluator(cls, cfg, dataset_name, output_folder=None):
116
+ return build_evaluator(cfg, dataset_name, output_folder)
117
+
118
+ @classmethod
119
+ def test_with_TTA(cls, cfg, model):
120
+ logger = logging.getLogger("detectron2.trainer")
121
+ # In the end of training, run an evaluation with TTA
122
+ # Only support some R-CNN models.
123
+ logger.info("Running inference with test-time augmentation ...")
124
+ model = GeneralizedRCNNWithTTA(cfg, model)
125
+ evaluators = [
126
+ cls.build_evaluator(
127
+ cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
128
+ )
129
+ for name in cfg.DATASETS.TEST
130
+ ]
131
+ res = cls.test(cfg, model, evaluators)
132
+ res = OrderedDict({k + "_TTA": v for k, v in res.items()})
133
+ return res
134
+
135
+
136
+ def setup(args):
137
+ """
138
+ Create configs and perform basic setups.
139
+ """
140
+ cfg = get_cfg()
141
+ # [modification] we add these two new keys
142
+ cfg.SOLVER.OPTIMIZER, cfg.SOLVER.LR_DECAY = 'sgd', 1.0 # by default using SGD and no lr_decay
143
+ cfg.merge_from_file(args.config_file)
144
+ cfg.merge_from_list(args.opts)
145
+ cfg.freeze()
146
+ default_setup(cfg, args)
147
+ return cfg
148
+
149
+
150
+ def main(args):
151
+ cfg = setup(args)
152
+ os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
153
+
154
+ # [modification] for implementing lr decay and for logging
155
+ Trainer.lr_decay_ratio = cfg.SOLVER.LR_DECAY
156
+
157
+ if args.eval_only:
158
+ model = Trainer.build_model(cfg)
159
+ DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
160
+ cfg.MODEL.WEIGHTS, resume=args.resume
161
+ )
162
+ res = Trainer.test(cfg, model)
163
+ if cfg.TEST.AUG.ENABLED:
164
+ res.update(Trainer.test_with_TTA(cfg, model))
165
+ if comm.is_main_process():
166
+ verify_results(cfg, res)
167
+ return res
168
+
169
+ # [modification] just skip some warnings
170
+ import warnings
171
+ comm.synchronize()
172
+ warnings.filterwarnings('ignore', category=UserWarning)
173
+ _ = np.arange(3, dtype=np.int).astype(np.bool)
174
+ _ = np.array(torch.ones(3, dtype=torch.int32).numpy(), dtype=np.int)
175
+ _ = np.array(torch.ones(3, dtype=torch.int64).numpy(), dtype=np.int)
176
+ _ = np.array(torch.ones(3, dtype=torch.long).numpy(), dtype=np.int)
177
+ _ = torch.rand(100) // 5
178
+ _ = torch.meshgrid(torch.ones(1))
179
+ warnings.resetwarnings()
180
+ comm.synchronize()
181
+
182
+ """
183
+ If you'd like to do anything fancier than the standard training logic,
184
+ consider writing your own training loop (see plain_train_net.py) or
185
+ subclassing the trainer.
186
+ """
187
+ trainer = Trainer(cfg)
188
+ trainer.resume_or_load(resume=args.resume)
189
+ for h in trainer._hooks:
190
+ if isinstance(h, PeriodicWriter):
191
+ h._period = 1000 # [modification] less logging
192
+
193
+ # [modification] we add some hooks for logging
194
+ is_local_master = comm.get_rank() % args.num_gpus == 0
195
+ if comm.is_main_process():
196
+ print(f'[default hooks] {pformat(trainer._hooks, indent=2, width=300)}')
197
+ ex_hooks = [
198
+ hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model)) if cfg.TEST.AUG.ENABLED else None,
199
+ LogHook(cfg.TEST.EVAL_PERIOD, args.config_file, cfg.OUTPUT_DIR, is_local_master) if comm.is_main_process() else None,
200
+ ]
201
+ trainer.register_hooks(ex_hooks)
202
+ if comm.is_main_process():
203
+ print(f'[extra hooks] {pformat(ex_hooks, indent=2, width=300)}')
204
+
205
+ return trainer.train()
206
+
207
+
208
+ # [modification] we add a hook for logging results to `cfg.OUTPUT_DIR/d2_coco_log.txt`
209
+ class LogHook(hooks.HookBase):
210
+ def __init__(self, eval_period, config_file, output_dir, is_local_master):
211
+ self.eval_period = eval_period
212
+ self.log_period = eval_period // 4
213
+ self.log = {}
214
+
215
+ self.is_master = comm.is_main_process()
216
+ self.is_local_master = is_local_master
217
+
218
+ self.config_file = config_file
219
+ self.out_dir = output_dir
220
+ self.log_txt_name = os.path.join(self.out_dir, 'd2_coco_log.txt')
221
+
222
+ def __write_to_log_file(self, d):
223
+ if self.is_local_master:
224
+ self.log.update(d)
225
+ with open(self.log_txt_name, 'w') as fp:
226
+ json.dump(self.log, fp)
227
+ fp.write('\n')
228
+
229
+ def update_and_write_to_local_log(self):
230
+ stat = self.trainer.storage.latest()
231
+ self.log['boxAP'], self.log['bAP50'], self.log['bAP75'] = stat['bbox/AP'][0], stat['bbox/AP50'][0], stat['bbox/AP75'][0]
232
+ self.log['mskAP'], self.log['mAP50'], self.log['mAP75'] = stat['segm/AP'][0], stat['segm/AP50'][0], stat['segm/AP75'][0]
233
+ self.log['bAP-l'], self.log['bAP-m'], self.log['bAP-s'] = stat['bbox/APl'][0], stat['bbox/APm'][0], stat['bbox/APs'][0]
234
+ self.log['mAP-l'], self.log['mAP-m'], self.log['mAP-s'] = stat['segm/APl'][0], stat['segm/APm'][0], stat['segm/APs'][0]
235
+ all_ap = sorted([(v[0], k.split('AP-')[-1].strip()) for k, v in stat.items() if k.startswith('bbox/AP-')])
236
+ all_ap = [tu[1] for tu in all_ap]
237
+ self.log['easy'] = ' | '.join(all_ap[-7:])
238
+ self.log['hard'] = ' | '.join(all_ap[:7])
239
+ for k in self.log.keys():
240
+ if 'AP' in k:
241
+ self.log[k] = round(self.log[k], 3)
242
+ self.__write_to_log_file({})
243
+
244
+ def after_step(self):
245
+ next_iter = self.trainer.iter + 1
246
+ if self.eval_period > 0 and next_iter % self.eval_period == 0:
247
+ self.update_and_write_to_local_log()
248
+
249
+ if self.log_period > 0 and next_iter % self.log_period == 0:
250
+ stat = self.trainer.storage.latest()
251
+ remain_secs = round(stat['eta_seconds'][0])
252
+ d = {
253
+ 'cfg': self.config_file,
254
+ 'rema': str(datetime.timedelta(seconds=remain_secs)), 'fini': time.strftime("%m-%d %H:%M", time.localtime(time.time() + remain_secs)),
255
+ 'cur_iter': f'{next_iter}/{self.trainer.max_iter}',
256
+ }
257
+ self.__write_to_log_file(d)
258
+
259
+ def after_train(self):
260
+ self.update_and_write_to_local_log()
261
+ last_boxAP, last_mskAP = round(self.log['boxAP'], 3), round(self.log['mskAP'], 3)
262
+ self.__write_to_log_file({
263
+ 'rema': '-', 'fini': time.strftime("%m-%d %H:%M", time.localtime(time.time() - 120)),
264
+ 'last_boxAP': last_boxAP,
265
+ 'last_mskAP': last_mskAP,
266
+ })
267
+ time.sleep(5)
268
+ if self.is_master:
269
+ print(f'\n[finished] ========== last_boxAP={last_boxAP}, last_mskAP={last_mskAP} ==========\n')
270
+
271
+
272
+ if __name__ == "__main__":
273
+ args = default_argument_parser().parse_args()
274
+ print("Command Line Args:", args)
275
+ launch(
276
+ main,
277
+ args.num_gpus,
278
+ num_machines=args.num_machines,
279
+ machine_rank=args.machine_rank,
280
+ dist_url=args.dist_url,
281
+ args=(args,),
282
+ )
283
+
284
+
285
+ def build_evaluator(cfg, dataset_name, output_folder=None):
286
+ """
287
+ Create evaluator(s) for a given dataset.
288
+ This uses the special metadata "evaluator_type" associated with each builtin dataset.
289
+ For your own dataset, you can simply create an evaluator manually in your
290
+ script and do not have to worry about the hacky if-else logic here.
291
+ """
292
+ if output_folder is None:
293
+ output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
294
+ evaluator_list = []
295
+ evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
296
+ if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
297
+ evaluator_list.append(
298
+ SemSegEvaluator(
299
+ dataset_name,
300
+ distributed=True,
301
+ output_dir=output_folder,
302
+ )
303
+ )
304
+ if evaluator_type in ["coco", "coco_panoptic_seg"]:
305
+ evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
306
+ if evaluator_type == "coco_panoptic_seg":
307
+ evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
308
+ if evaluator_type == "cityscapes_instance":
309
+ return CityscapesInstanceEvaluator(dataset_name)
310
+ if evaluator_type == "cityscapes_sem_seg":
311
+ return CityscapesSemSegEvaluator(dataset_name)
312
+ elif evaluator_type == "pascal_voc":
313
+ return PascalVOCDetectionEvaluator(dataset_name)
314
+ elif evaluator_type == "lvis":
315
+ return LVISEvaluator(dataset_name, output_dir=output_folder)
316
+ if len(evaluator_list) == 0:
317
+ raise NotImplementedError(
318
+ "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type)
319
+ )
320
+ elif len(evaluator_list) == 1:
321
+ return evaluator_list[0]
322
+ return DatasetEvaluators(evaluator_list)
spark/downstream_imagenet/README.md ADDED
@@ -0,0 +1,54 @@
1
+ ## About code isolation
2
+
3
+ This `downstream_imagenet` directory is isolated from the pre-training code. One can treat it as an independent codebase 🛠️.
4
+
5
+
6
+ ## Preparation for ImageNet-1k fine-tuning
7
+
8
+ See [INSTALL.md](https://github.com/keyu-tian/SparK/blob/main/INSTALL.md) to prepare `pip` dependencies and the ImageNet dataset.
9
+
10
+ **Note: for network definitions, we directly use `timm.models.ResNet` and [official ConvNeXt](https://github.com/facebookresearch/ConvNeXt/blob/048efcea897d999aed302f2639b6270aedf8d4c8/models/convnext.py).**
11
+
12
+
13
+ ## Fine-tuning on ImageNet-1k from pre-trained weights
14
+
15
+ Run [/downstream_imagenet/main.py](/downstream_imagenet/main.py) via `torchrun`.
16
+ **It is required to specify** the ImageNet data folder (`--data_path`), your experiment name & log dir (`--exp_name` and `--exp_dir`, automatically created if they do not exist), the model name (`--model`; for valid choices, see the keys of `HP_DEFAULT_VALUES` in [/downstream_imagenet/arg.py line14](/downstream_imagenet/arg.py#L14)), and the pretrained weight file (`--resume_from`) to run fine-tuning.
17
+
18
+ All the other configurations have their default values, listed in [/downstream_imagenet/arg.py#L13](/downstream_imagenet/arg.py#L13).
19
+ You can override any of these defaults on the command line, e.g. `--bs=1024`.
20
+
21
+
22
+ Here is an example of fine-tuning a ConvNeXt-Small on a single machine with 8 GPUs:
23
+ ```shell script
24
+ $ cd /path/to/SparK/downstream_imagenet
25
+ $ torchrun --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr=localhost --master_port=<some_port> main.py \
26
+ --data_path=/path/to/imagenet --exp_name=<your_exp_name> --exp_dir=/path/to/logdir \
27
+ --model=convnext_small --resume_from=/some/path/to/convnextS_1kpretrained_official_style.pth
28
+ ```
29
+
30
+ For multiple machines, change the `--nnodes` and `--master_addr` to your configurations. E.g.:
31
+ ```shell script
32
+ $ torchrun --nproc_per_node=8 --nnodes=<your_nnodes> --node_rank=<rank_starts_from_0> --master_addr=<some_address> --master_port=<some_port> main.py \
33
+ ...
34
+ ```
35
+
36
+
37
+ ## Logging
38
+
39
+ See files under `--exp_dir` to track your experiment:
40
+
41
+ - `<model>_1kfinetuned_last.pth`: the latest model weights
42
+ - `<model>_1kfinetuned_best.pth`: model weights with the highest acc
43
+ - `<model>_1kfinetuned_best_ema.pth`: EMA weights with the highest acc
44
+ - `finetune_log.txt`: records some important information such as:
45
+ - `git_commit_id`: git version
46
+ - `cmd`: all arguments passed to the script
47
+
48
+ It also reports training loss/acc, best evaluation acc, and remaining time at each epoch.
49
+
50
+ - `tensorboard_log/`: stores the TensorBoard logs; you can visualize accuracies, loss values, learning rates, gradient norms, and more via `tensorboard --logdir /path/to/this/tensorboard_log/ --port 23333`.
51
+
52
+ ## Resuming
53
+
54
+ Use `--resume_from` again, like `--resume_from=path/to/<model>_1kfinetuned_last.pth`.
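
For reference, the `finetune_log.txt` described in the Logging section stores one JSON object per line (a header record first, then one record per epoch), so it can be parsed as below. This is a minimal sketch, not part of the repository; the path is a placeholder:

```python
import json

with open('/path/to/logdir/finetune_log.txt') as fp:
    records = [json.loads(line) for line in fp if line.strip()]

header, epochs = records[0], records[1:]
print(header['name'], header['git_commit_id'])
print('best val acc so far:', max((e['best_val_acc'] for e in epochs), default=None))
```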
spark/downstream_imagenet/arg.py ADDED
@@ -0,0 +1,137 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import json
8
+ import os
9
+ import sys
10
+
11
+ from tap import Tap
12
+
13
+ HP_DEFAULT_NAMES = ['bs', 'ep', 'wp_ep', 'opt', 'base_lr', 'lr_scale', 'wd', 'mixup', 'rep_aug', 'drop_path', 'ema']
14
+ HP_DEFAULT_VALUES = {
15
+ 'convnext_small': (4096, 400, 20, 'adam', 0.0002, 0.7, 0.01, 0.8, 3, 0.3, 0.9999),
16
+ 'convnext_base': (4096, 400, 20, 'adam', 0.0001, 0.7, 0.01, 0.8, 3, 0.4, 0.9999),
17
+ 'convnext_large': (4096, 200, 10, 'adam', 0.0001, 0.7, 0.02, 0.8, 3, 0.5, 0.9999),
18
+ 'convnext_large_384': (1024, 200, 20, 'adam', 0.00006, 0.7, 0.01, 0.8, 3, 0.5, 0.99995),
19
+
20
+ 'resnet50': (4096, 300, 5, 'lamb', 0.002, 0.7, 0.02, 0.1, 0, 0.05, 0.9999),
21
+ 'resnet101': (4096, 300, 5, 'lamb', 0.001, 0.8, 0.02, 0.1, 0, 0.2, 0.9999),
22
+ 'resnet152': (4096, 300, 5, 'lamb', 0.001, 0.8, 0.02, 0.1, 0, 0.2, 0.9999),
23
+ 'resnet200': (4096, 300, 5, 'lamb', 0.001, 0.8, 0.02, 0.1, 0, 0.2, 0.9999),
24
+ }
25
+
26
+
27
+ class FineTuneArgs(Tap):
28
+ # environment
29
+ exp_name: str
30
+ exp_dir: str
31
+ data_path: str
32
+ model: str
33
+ resume_from: str = '' # resume from some checkpoint.pth
34
+
35
+ img_size: int = 640
36
+ dataloader_workers: int = 8
37
+
38
+ # ImageNet classification fine-tuning hyperparameters; see `HP_DEFAULT_VALUES` above for detailed default values
39
+ # - batch size, epoch
40
+ bs: int = 0 # global batch size (== batch_size_per_gpu * num_gpus)
41
+ ep: int = 0 # number of epochs
42
+ wp_ep: int = 0 # epochs for warmup
43
+
44
+ # - optimization
45
+ opt: str = '' # optimizer; 'adam' or 'lamb'
46
+ base_lr: float = 0. # peak lr == base_lr * glb_batch_size / 256 (computed in get_args)
47
+ lr_scale: float = 0. # see file `lr_decay.py` for more details
48
+ clip: int = -1 # use gradient clipping if clip > 0
49
+
50
+ # - regularization tricks
51
+ wd: float = 0. # weight decay
52
+ mixup: float = 0. # use mixup if mixup > 0
53
+ rep_aug: int = 0 # use repeated augmentation if rep_aug > 0
54
+ drop_path: float = 0. # drop_path ratio
55
+
56
+ # - other tricks
57
+ ema: float = 0. # use EMA if ema > 0
58
+ sbn: bool = True # use SyncBatchNorm
59
+
60
+ # NO NEED TO SPECIFY; each of these args will be updated automatically at runtime
61
+ lr: float = None
62
+ batch_size_per_gpu: int = 0
63
+ glb_batch_size: int = 0
64
+ device: str = 'cpu'
65
+ world_size: int = 1
66
+ global_rank: int = 0
67
+ local_rank: int = 0 # we DO USE this arg
68
+ is_master: bool = False
69
+ is_local_master: bool = False
70
+ cmd: str = ' '.join(sys.argv[1:])
71
+ commit_id: str = os.popen(f'git rev-parse HEAD').read().strip()
72
+ commit_msg: str = os.popen(f'git log -1').read().strip().splitlines()[-1].strip()
73
+ log_txt_name: str = '{args.exp_dir}/pretrain_log.txt'
74
+ tb_lg_dir: str = '' # tensorboard log directory
75
+
76
+ train_loss: float = 0.
77
+ train_acc: float = 0.
78
+ best_val_acc: float = 0.
79
+ cur_ep: str = ''
80
+ remain_time: str = ''
81
+ finish_time: str = ''
82
+ first_logging: bool = True
83
+
84
+ def log_epoch(self):
85
+ if not self.is_local_master:
86
+ return
87
+
88
+ if self.first_logging:
89
+ self.first_logging = False
90
+ with open(self.log_txt_name, 'w') as fp:
91
+ json.dump({
92
+ 'name': self.exp_name, 'cmd': self.cmd, 'git_commit_id': self.commit_id, 'git_commit_msg': self.commit_msg,
93
+ 'model': self.model,
94
+ }, fp)
95
+ fp.write('\n\n')
96
+
97
+ with open(self.log_txt_name, 'a') as fp:
98
+ json.dump({
99
+ 'cur_ep': self.cur_ep,
100
+ 'train_L': self.train_loss, 'train_acc': self.train_acc,
101
+ 'best_val_acc': self.best_val_acc,
102
+ 'rema': self.remain_time, 'fini': self.finish_time,
103
+ }, fp)
104
+ fp.write('\n')
105
+
106
+
107
+ def get_args(world_size, global_rank, local_rank, device) -> FineTuneArgs:
108
+ # parse args and prepare directories
109
+ args = FineTuneArgs(explicit_bool=True).parse_args()
110
+ d_name, b_name = os.path.dirname(os.path.abspath(args.exp_dir)), os.path.basename(os.path.abspath(args.exp_dir))
111
+ b_name = ''.join(ch if (ch.isalnum() or ch == '-') else '_' for ch in b_name)
112
+ args.exp_dir = os.path.join(d_name, b_name)
113
+ os.makedirs(args.exp_dir, exist_ok=True)
114
+ args.log_txt_name = os.path.join(args.exp_dir, 'finetune_log.txt')
115
+
116
+ args.tb_lg_dir = args.tb_lg_dir or os.path.join(args.exp_dir, 'tensorboard_log')
117
+ try: os.makedirs(args.tb_lg_dir, exist_ok=True)
118
+ except: pass
119
+
120
+ # fill in args.bs, args.ep, etc. with their default values (if their values are not explicitly specified, i.e., if bool(they) == False)
121
+ if args.model == 'convnext_large' and args.img_size == 384:
122
+ default_values = HP_DEFAULT_VALUES['convnext_large_384']
123
+ else:
124
+ default_values = HP_DEFAULT_VALUES[args.model]
125
+ for k, v in zip(HP_DEFAULT_NAMES, default_values):
126
+ if bool(getattr(args, k)) == False:
127
+ setattr(args, k, v)
128
+
129
+ # update other runtime args
130
+ args.world_size, args.global_rank, args.local_rank, args.device = world_size, global_rank, local_rank, device
131
+ args.is_master = global_rank == 0
132
+ args.is_local_master = local_rank == 0
133
+ args.batch_size_per_gpu = args.bs // world_size
134
+ args.glb_batch_size = args.batch_size_per_gpu * world_size
135
+ args.lr = args.base_lr * args.glb_batch_size / 256
136
+
137
+ return args
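
A worked example (not part of the repository) of the linear lr scaling applied at the end of `get_args`, using the `convnext_small` defaults from `HP_DEFAULT_VALUES` (`bs=4096`, `base_lr=0.0002`):

```python
# peak lr == base_lr * glb_batch_size / 256
bs, base_lr = 4096, 0.0002
print(base_lr * bs / 256)  # 0.0032
```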
spark/downstream_imagenet/data.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import random
9
+ import time
10
+
11
+ import PIL.Image as PImage
12
+ import numpy as np
13
+ import torch
14
+ import torchvision
15
+ from timm.data import AutoAugment as TimmAutoAugment
16
+ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, create_transform
17
+ from timm.data.distributed_sampler import RepeatAugSampler
18
+ from timm.data.transforms_factory import transforms_imagenet_eval
19
+ from torch.utils.data import DataLoader
20
+ from torch.utils.data.sampler import Sampler
21
+ from torchvision.transforms import AutoAugment as TorchAutoAugment
22
+ from torchvision.transforms import transforms, TrivialAugmentWide
23
+
24
+ try:
25
+ from torchvision.transforms import InterpolationMode
26
+ interpolation = InterpolationMode.BICUBIC
27
+ except:
28
+ import PIL
29
+ interpolation = PIL.Image.BICUBIC
30
+
31
+
32
+ def create_classification_dataset(data_path, img_size, rep_aug, workers, batch_size_per_gpu, world_size, global_rank):
33
+ import warnings
34
+ warnings.filterwarnings('ignore', category=UserWarning)
35
+
36
+ mean, std = IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
37
+ trans_train = create_transform(
38
+ is_training=True, input_size=img_size,
39
+ auto_augment='v0', interpolation='bicubic', re_prob=0.25, re_mode='pixel', re_count=1,
40
+ mean=mean, std=std,
41
+ )
42
+ if img_size < 384:
43
+ for i, t in enumerate(trans_train.transforms):
44
+ if isinstance(t, (TorchAutoAugment, TimmAutoAugment)):
45
+ trans_train.transforms[i] = TrivialAugmentWide(interpolation=interpolation)
46
+ break
47
+ trans_val = transforms_imagenet_eval(img_size=img_size, interpolation='bicubic', crop_pct=0.95, mean=mean, std=std)
48
+ else:
49
+ trans_val = transforms.Compose([
50
+ transforms.Resize((img_size, img_size), interpolation=interpolation),
51
+ transforms.ToTensor(), transforms.Normalize(mean=mean, std=std),
52
+ ])
53
+ print_transform(trans_train, '[train]')
54
+ print_transform(trans_val, '[val]')
55
+
56
+ imagenet_folder = os.path.abspath(data_path)
57
+ for postfix in ('train', 'val'):
58
+ if imagenet_folder.endswith(postfix):
59
+ imagenet_folder = imagenet_folder[:-len(postfix)]
60
+ dataset_train = torchvision.datasets.ImageFolder(os.path.join(imagenet_folder, 'train'), trans_train)
61
+ dataset_val = torchvision.datasets.ImageFolder(os.path.join(imagenet_folder, 'val'), trans_val)
62
+
63
+ if rep_aug:
64
+ print(f'[dataset] using repeated augmentation: count={rep_aug}')
65
+ train_sp = RepeatAugSampler(dataset_train, shuffle=True, num_repeats=rep_aug)
66
+ else:
67
+ train_sp = torch.utils.data.distributed.DistributedSampler(dataset_train, shuffle=True, drop_last=True)
68
+
69
+ loader_train = DataLoader(
70
+ dataset=dataset_train, num_workers=workers, pin_memory=True,
71
+ batch_size=batch_size_per_gpu, sampler=train_sp, persistent_workers=workers > 0,
72
+ worker_init_fn=worker_init_fn,
73
+ )
74
+ iters_train = len(loader_train)
75
+ print(f'[dataset: train] bs={world_size}x{batch_size_per_gpu}={world_size * batch_size_per_gpu}, num_iters={iters_train}')
76
+
77
+ val_ratio = 2
78
+ loader_val = DataLoader(
79
+ dataset=dataset_val, num_workers=workers, pin_memory=True,
80
+ batch_sampler=DistInfiniteBatchSampler(world_size, global_rank, len(dataset_val), glb_batch_size=val_ratio * batch_size_per_gpu, filling=False, shuffle=False),
81
+ worker_init_fn=worker_init_fn,
82
+ )
83
+ iters_val = len(loader_val)
84
+ print(f'[dataset: val] bs={world_size}x{val_ratio * batch_size_per_gpu}={val_ratio * world_size * batch_size_per_gpu}, num_iters={iters_val}')
85
+
86
+ time.sleep(3)
87
+ warnings.resetwarnings()
88
+ return loader_train, iters_train, iter(loader_val), iters_val
89
+
90
+
91
+ def worker_init_fn(worker_id):
92
+ # see: https://pytorch.org/docs/stable/notes/randomness.html#dataloader
93
+ worker_seed = torch.initial_seed() % 2 ** 32
94
+ np.random.seed(worker_seed)
95
+ random.seed(worker_seed)
96
+
97
+
98
+ def print_transform(transform, s):
99
+ print(f'Transform {s} = ')
100
+ for t in transform.transforms:
101
+ print(t)
102
+ print('---------------------------\n')
103
+
104
+
105
+ class DistInfiniteBatchSampler(Sampler):
106
+ def __init__(self, world_size, global_rank, dataset_len, glb_batch_size, seed=0, filling=False, shuffle=True):
107
+ assert glb_batch_size % world_size == 0
108
+ self.world_size, self.rank = world_size, global_rank
109
+ self.dataset_len = dataset_len
110
+ self.glb_batch_size = glb_batch_size
111
+ self.batch_size = glb_batch_size // world_size
112
+
113
+ self.iters_per_ep = (dataset_len + glb_batch_size - 1) // glb_batch_size
114
+ self.filling = filling
115
+ self.shuffle = shuffle
116
+ self.epoch = 0
117
+ self.seed = seed
118
+ self.indices = self.gener_indices()
119
+
120
+ def gener_indices(self):
121
+ global_max_p = self.iters_per_ep * self.glb_batch_size # global_max_p % world_size must be 0 cuz glb_batch_size % world_size == 0
122
+ if self.shuffle:
123
+ g = torch.Generator()
124
+ g.manual_seed(self.epoch + self.seed)
125
+ global_indices = torch.randperm(self.dataset_len, generator=g)
126
+ else:
127
+ global_indices = torch.arange(self.dataset_len)
128
+ filling = global_max_p - global_indices.shape[0]
129
+ if filling > 0 and self.filling:
130
+ global_indices = torch.cat((global_indices, global_indices[:filling]))
131
+ global_indices = tuple(global_indices.numpy().tolist())
132
+
133
+ seps = torch.linspace(0, len(global_indices), self.world_size + 1, dtype=torch.int)
134
+ local_indices = global_indices[seps[self.rank]:seps[self.rank + 1]]
135
+ self.max_p = len(local_indices)
136
+ return local_indices
137
+
138
+ def __iter__(self):
139
+ self.epoch = 0
140
+ while True:
141
+ self.epoch += 1
142
+ p, q = 0, 0
143
+ while p < self.max_p:
144
+ q = p + self.batch_size
145
+ yield self.indices[p:q]
146
+ p = q
147
+ if self.shuffle:
148
+ self.indices = self.gener_indices()
149
+
150
+ def __len__(self):
151
+ return self.iters_per_ep
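
A minimal sketch (not part of the repository) of how `DistInfiniteBatchSampler` partitions a toy dataset across ranks; the sizes are made up for illustration, no distributed setup is needed, and the import assumes the script is run from `spark/downstream_imagenet/`:

```python
from data import DistInfiniteBatchSampler

for rank in (0, 1):
    sampler = DistInfiniteBatchSampler(world_size=2, global_rank=rank, dataset_len=10,
                                       glb_batch_size=4, filling=False, shuffle=False)
    it = iter(sampler)
    # the sampler is infinite; take one epoch's worth of batches (len(sampler) == iters_per_ep == 3)
    print(rank, [next(it) for _ in range(len(sampler))])
# rank 0 yields indices 0..4 in per-GPU batches of 2, rank 1 yields indices 5..9
```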
spark/downstream_imagenet/lr_decay.py ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from pprint import pformat
9
+
10
+
11
+ def lr_wd_annealing(optimizer, peak_lr, wd, cur_it, wp_it, max_it):
12
+ wp_it = round(wp_it)
13
+ if cur_it < wp_it:
14
+ cur_lr = 0.005 * peak_lr + 0.995 * peak_lr * cur_it / wp_it
15
+ else:
16
+ ratio = (cur_it - wp_it) / (max_it - 1 - wp_it)
17
+ cur_lr = 0.001 * peak_lr + 0.999 * peak_lr * (0.5 + 0.5 * math.cos(math.pi * ratio))
18
+
19
+ min_lr, max_lr = cur_lr, cur_lr
20
+ min_wd, max_wd = wd, wd
21
+ for param_group in optimizer.param_groups:
22
+ scaled_lr = param_group['lr'] = cur_lr * param_group.get('lr_scale', 1) # 'lr_scale' could be assigned
23
+ min_lr, max_lr = min(min_lr, scaled_lr), max(max_lr, scaled_lr)
24
+ scaled_wd = param_group['weight_decay'] = wd * param_group.get('weight_decay_scale', 1) # 'weight_decay_scale' could be assigned
25
+ min_wd, max_wd = min(min_wd, scaled_wd), max(max_wd, scaled_wd)
26
+ return min_lr, max_lr, min_wd, max_wd
27
+
28
+
29
+ def get_param_groups(model, nowd_keys=(), lr_scale=0.0):
30
+ using_lr_scale = hasattr(model, 'get_layer_id_and_scale_exp') and 0.0 < lr_scale < 1.0
31
+ print(f'[get_ft_param_groups][lr decay] using_lr_scale={using_lr_scale}, ft_lr_scale={lr_scale}')
32
+ para_groups, para_groups_dbg = {}, {}
33
+
34
+ for name, para in model.named_parameters():
35
+ if not para.requires_grad:
36
+ continue # frozen weights
37
+ if len(para.shape) == 1 or name.endswith('.bias') or any(k in name for k in nowd_keys):
38
+ wd_scale, group_name = 0., 'no_decay'
39
+ else:
40
+ wd_scale, group_name = 1., 'decay'
41
+
42
+ if using_lr_scale:
43
+ layer_id, scale_exp = model.get_layer_id_and_scale_exp(name)
44
+ group_name = f'layer{layer_id}_' + group_name
45
+ this_lr_scale = lr_scale ** scale_exp
46
+ dbg = f'[layer {layer_id}][sc = {lr_scale} ** {scale_exp}]'
47
+ else:
48
+ this_lr_scale = 1
49
+ dbg = f'[no scale]'
50
+
51
+ if group_name not in para_groups:
52
+ para_groups[group_name] = {'params': [], 'weight_decay_scale': wd_scale, 'lr_scale': this_lr_scale}
53
+ para_groups_dbg[group_name] = {'params': [], 'weight_decay_scale': wd_scale, 'lr_scale': dbg}
54
+ para_groups[group_name]['params'].append(para)
55
+ para_groups_dbg[group_name]['params'].append(name)
56
+
57
+ for g in para_groups_dbg.values():
58
+ g['params'] = pformat(', '.join(g['params']), width=200)
59
+
60
+ print(f'[get_ft_param_groups] param groups = \n{pformat(para_groups_dbg, indent=2, width=250)}\n')
61
+ return list(para_groups.values())
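
A minimal sketch (not part of the repository) of the warmup-then-cosine schedule implemented by `lr_wd_annealing`, probed with a throwaway SGD optimizer; the peak lr, weight decay, and iteration counts are arbitrary, and the import assumes the script is run from `spark/downstream_imagenet/`:

```python
import torch
from lr_decay import lr_wd_annealing

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.0)
wp_it, max_it = 100, 1000
for it in (0, 50, 100, 500, 999):
    _, max_lr, _, max_wd = lr_wd_annealing(opt, 2e-3, 0.02, it, wp_it, max_it)
    print(f'it={it:4d}  lr={max_lr:.2e}  wd={max_wd:.2e}')
# lr ramps from 0.005 * peak to peak over the first 100 iterations, then follows a cosine down to 0.001 * peak
```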
spark/downstream_imagenet/main.py ADDED
@@ -0,0 +1,189 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import datetime
8
+ import time
9
+
10
+ import torch
11
+ import torch.distributed as tdist
12
+ from timm.utils import ModelEmaV2
13
+ from torch.utils.tensorboard import SummaryWriter
14
+
15
+ from arg import get_args, FineTuneArgs
16
+ from models import ConvNeXt, ResNet
17
+ __for_timm_registration = ConvNeXt, ResNet
18
+ from lr_decay import lr_wd_annealing
19
+ from util import init_distributed_environ, create_model_opt, load_checkpoint, save_checkpoint
20
+ from data import create_classification_dataset
21
+
22
+
23
+ def main_ft():
24
+ world_size, global_rank, local_rank, device = init_distributed_environ()
25
+ args: FineTuneArgs = get_args(world_size, global_rank, local_rank, device)
26
+ print(f'initial args:\n{str(args)}')
27
+ args.log_epoch()
28
+
29
+ criterion, mixup_fn, model_without_ddp, model, model_ema, optimizer = create_model_opt(args)
30
+ ep_start, performance_desc = load_checkpoint(args.resume_from, model_without_ddp, model_ema, optimizer)
31
+
32
+ if ep_start >= args.ep: # load from a complete checkpoint file
33
+ print(f' [*] [FT already done] Max/Last Acc: {performance_desc}')
34
+ else:
35
+ tb_lg = SummaryWriter(args.tb_lg_dir) if args.is_master else None
36
+ loader_train, iters_train, iterator_val, iters_val = create_classification_dataset(
37
+ args.data_path, args.img_size, args.rep_aug,
38
+ args.dataloader_workers, args.batch_size_per_gpu, args.world_size, args.global_rank
39
+ )
40
+
41
+ # train & eval
42
+ tot_pred, last_acc = evaluate(args.device, iterator_val, iters_val, model)
43
+ max_acc = last_acc
44
+ max_acc_e = last_acc_e = evaluate(args.device, iterator_val, iters_val, model_ema.module)[-1]
45
+ print(f'[fine-tune] initial acc={last_acc:.2f}, ema={last_acc_e:.2f}')
46
+
47
+ ep_eval = set(range(0, args.ep//3, 5)) | set(range(args.ep//3, args.ep))
48
+ print(f'[FT start] ep_eval={sorted(ep_eval)} ')
49
+ print(f'[FT start] from ep{ep_start}')
50
+
51
+ params_req_grad = [p for p in model.parameters() if p.requires_grad]
52
+ ft_start_time = time.time()
53
+ for ep in range(ep_start, args.ep):
54
+ ep_start_time = time.time()
55
+ if hasattr(loader_train, 'sampler') and hasattr(loader_train.sampler, 'set_epoch'):
56
+ loader_train.sampler.set_epoch(ep)
57
+ if 0 <= ep <= 3:
58
+ print(f'[loader_train.sampler.set_epoch({ep})]')
59
+
60
+ train_loss, train_acc = fine_tune_one_epoch(ep, args, tb_lg, loader_train, iters_train, criterion, mixup_fn, model, model_ema, optimizer, params_req_grad)
61
+ if ep in ep_eval:
62
+ eval_start_time = time.time()
63
+ tot_pred, last_acc = evaluate(args.device, iterator_val, iters_val, model)
64
+ tot_pred_e, last_acc_e = evaluate(args.device, iterator_val, iters_val, model_ema.module)
65
+ eval_cost = round(time.time() - eval_start_time, 2)
66
+ performance_desc = f'Max (Last) Acc: {max(max_acc, last_acc):.2f} ({last_acc:.2f} o {tot_pred}) EMA: {max(max_acc_e, last_acc_e):.2f} ({last_acc_e:.2f} o {tot_pred_e})'
67
+ states = model_without_ddp.state_dict(), model_ema.module.state_dict(), optimizer.state_dict()
68
+ if last_acc > max_acc:
69
+ max_acc = last_acc
70
+ save_checkpoint(f'{args.model}_1kfinetuned_best.pth', args, ep, performance_desc, *states)
71
+ if last_acc_e > max_acc_e:
72
+ max_acc_e = last_acc_e
73
+ save_checkpoint(f'{args.model}_1kfinetuned_best_ema.pth', args, ep, performance_desc, *states)
74
+ save_checkpoint(f'{args.model}_1kfinetuned_last.pth', args, ep, performance_desc, *states)
75
+ else:
76
+ eval_cost = '-'
77
+
78
+ ep_cost = round(time.time() - ep_start_time, 2) + 1 # +1s: approximate the following logging cost
79
+ remain_secs = (args.ep-1 - ep) * ep_cost
80
+ remain_time = datetime.timedelta(seconds=round(remain_secs))
81
+ finish_time = time.strftime("%m-%d %H:%M", time.localtime(time.time() + remain_secs))
82
+ print(f'[ep{ep}/{args.ep}] {performance_desc} Ep cost: {ep_cost}s, Ev cost: {eval_cost}, Remain: {remain_time}, Finish @ {finish_time}')
83
+ args.cur_ep = f'{ep + 1}/{args.ep}'
84
+ args.remain_time, args.finish_time = str(remain_time), str(finish_time)
85
+ args.train_loss, args.train_acc, args.best_val_acc = train_loss, train_acc, max(max_acc, max_acc_e)
86
+ args.log_epoch()
87
+
88
+ if args.is_master:
89
+ tb_lg.add_scalar(f'ft_train/ep_loss', train_loss, ep)
90
+ tb_lg.add_scalar(f'ft_eval/max_acc', max_acc, ep)
91
+ tb_lg.add_scalar(f'ft_eval/last_acc', last_acc, ep)
92
+ tb_lg.add_scalar(f'ft_eval/max_acc_ema', max_acc_e, ep)
93
+ tb_lg.add_scalar(f'ft_eval/last_acc_ema', last_acc_e, ep)
94
+ tb_lg.add_scalar(f'ft_z_burnout/rest_hours', round(remain_secs/60/60, 2), ep)
95
+ tb_lg.flush()
96
+
97
+ # finish fine-tuning
98
+ result_acc = max(max_acc, max_acc_e)
99
+ if args.is_master:
100
+ tb_lg.add_scalar('ft_result/result_acc', result_acc, ep_start)
101
+ tb_lg.add_scalar('ft_result/result_acc', result_acc, args.ep)
102
+ tb_lg.flush()
103
+ tb_lg.close()
104
+ print(f'final args:\n{str(args)}')
105
+ print('\n\n')
106
+ print(f' [*] [FT finished] {performance_desc} Total Cost: {(time.time() - ft_start_time) / 60 / 60:.1f}h\n')
107
+ print(f' [*] [FT finished] max(max_acc, max_acc_e)={result_acc} EMA better={max_acc_e>max_acc}')
108
+ print('\n\n')
109
+ time.sleep(10)
110
+
111
+ args.remain_time, args.finish_time = '-', time.strftime("%m-%d %H:%M", time.localtime(time.time()))
112
+ args.log_epoch()
113
+
114
+
115
+ def fine_tune_one_epoch(ep, args: FineTuneArgs, tb_lg: SummaryWriter, loader_train, iters_train, criterion, mixup_fn, model, model_ema: ModelEmaV2, optimizer, params_req_grad):
116
+ model.train()
117
+ tot_loss = tot_acc = 0.0
118
+ log_freq = max(1, round(iters_train * 0.7))
119
+ ep_start_time = time.time()
120
+ for it, (inp, tar) in enumerate(loader_train):
121
+ # adjust lr and wd
122
+ cur_it = it + ep * iters_train
123
+ min_lr, max_lr, min_wd, max_wd = lr_wd_annealing(optimizer, args.lr, args.wd, cur_it, args.wp_ep * iters_train, args.ep * iters_train)
124
+
125
+ # forward
126
+ inp = inp.to(args.device, non_blocking=True)
127
+ raw_tar = tar = tar.to(args.device, non_blocking=True)
128
+ if mixup_fn is not None:
129
+ inp, tar, raw_tar = mixup_fn(inp, tar)
130
+ oup = model(inp)
131
+ pred = oup.data.argmax(dim=1)
132
+ if mixup_fn is None:
133
+ acc = pred.eq(tar).float().mean().item() * 100
134
+ tot_acc += acc
135
+ else:
136
+ acc = (pred.eq(raw_tar) | pred.eq(raw_tar.flip(0))).float().mean().item() * 100
137
+ tot_acc += acc
138
+
139
+ # backward
140
+ optimizer.zero_grad()
141
+ loss = criterion(oup, tar)
142
+ loss.backward()
143
+ loss = loss.item()
144
+ tot_loss += loss
145
+ if args.clip > 0:
146
+ orig_norm = torch.nn.utils.clip_grad_norm_(params_req_grad, args.clip).item()
147
+ else:
148
+ orig_norm = None
149
+ optimizer.step()
150
+ model_ema.update(model)
151
+ torch.cuda.synchronize()
152
+
153
+ # log
154
+ if args.is_master and cur_it % log_freq == 0:
155
+ tb_lg.add_scalar(f'ft_train/it_loss', loss, cur_it)
156
+ tb_lg.add_scalar(f'ft_train/it_acc', acc, cur_it)
157
+ tb_lg.add_scalar(f'ft_hp/min_lr', min_lr, cur_it), tb_lg.add_scalar(f'ft_hp/max_lr', max_lr, cur_it)
158
+ tb_lg.add_scalar(f'ft_hp/min_wd', min_wd, cur_it), tb_lg.add_scalar(f'ft_hp/max_wd', max_wd, cur_it)
159
+ if orig_norm is not None:
160
+ tb_lg.add_scalar(f'ft_hp/orig_norm', orig_norm, cur_it)
161
+
162
+ if it in [3, iters_train//2, iters_train-1]:
163
+ remain_secs = (iters_train-1 - it) * (time.time() - ep_start_time) / (it + 1)
164
+ remain_time = datetime.timedelta(seconds=round(remain_secs))
165
+ print(f'[ep{ep} it{it:3d}/{iters_train}] L: {loss:.4f} Acc: {acc:.2f} lr: {min_lr:.1e}~{max_lr:.1e} Remain: {remain_time}')
166
+
167
+ return tot_loss / iters_train, tot_acc / iters_train
168
+
169
+
170
+ @torch.no_grad()
171
+ def evaluate(dev, iterator_val, iters_val, model):
172
+ training = model.training
173
+ model.train(False)
174
+ tot_pred, tot_correct = 0., 0.
175
+ for _ in range(iters_val):
176
+ inp, tar = next(iterator_val)
177
+ tot_pred += tar.shape[0]
178
+ inp = inp.to(dev, non_blocking=True)
179
+ tar = tar.to(dev, non_blocking=True)
180
+ oup = model(inp)
181
+ tot_correct += oup.argmax(dim=1).eq(tar).sum().item()
182
+ model.train(training)
183
+ t = torch.tensor([tot_pred, tot_correct]).to(dev)
184
+ tdist.all_reduce(t)
185
+ return t[0].item(), (t[1] / t[0]).item() * 100.
186
+
187
+
188
+ if __name__ == '__main__':
189
+ main_ft()
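
An illustration (not part of the repository) of the evaluation schedule `ep_eval` built in `main_ft`: the model is evaluated every 5 epochs during the first third of training and every epoch afterwards. With `ep=300` as an example:

```python
ep = 300
ep_eval = set(range(0, ep // 3, 5)) | set(range(ep // 3, ep))
print(sorted(ep_eval)[:8], '...', len(ep_eval))  # [0, 5, 10, 15, 20, 25, 30, 35] ... 220
```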
spark/downstream_imagenet/mixup.py ADDED
@@ -0,0 +1,168 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This file is a modified version of timm.data.Mixup
8
+ # Fixes the "Batch size should be even when using this" error
9
+
10
+ """ Mixup and Cutmix
11
+
12
+ Papers:
13
+ mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412)
14
+
15
+ CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899)
16
+
17
+ Code Reference:
18
+ CutMix: https://github.com/clovaai/CutMix-PyTorch
19
+
20
+ Hacked together by / Copyright 2019, Ross Wightman
21
+ """
22
+ import numpy as np
23
+ import torch
24
+
25
+
26
+ def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'):
27
+ x = x.long().view(-1, 1)
28
+ return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value)
29
+
30
+
31
+ def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'):
32
+ off_value = smoothing / num_classes
33
+ on_value = 1. - smoothing + off_value
34
+ y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device)
35
+ y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device)
36
+ return y1 * lam + y2 * (1. - lam)
37
+
38
+
39
+ def rand_bbox(img_shape, lam, margin=0., count=None):
40
+ """ Standard CutMix bounding-box
41
+ Generates a random square bbox based on lambda value. This impl includes
42
+ support for enforcing a border margin as percent of bbox dimensions.
43
+
44
+ Args:
45
+ img_shape (tuple): Image shape as tuple
46
+ lam (float): Cutmix lambda value
47
+ margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image)
48
+ count (int): Number of bbox to generate
49
+ """
50
+ ratio = np.sqrt(1 - lam)
51
+ img_h, img_w = img_shape[-2:]
52
+ cut_h, cut_w = int(img_h * ratio), int(img_w * ratio)
53
+ margin_y, margin_x = int(margin * cut_h), int(margin * cut_w)
54
+ cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count)
55
+ cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count)
56
+ yl = np.clip(cy - cut_h // 2, 0, img_h)
57
+ yh = np.clip(cy + cut_h // 2, 0, img_h)
58
+ xl = np.clip(cx - cut_w // 2, 0, img_w)
59
+ xh = np.clip(cx + cut_w // 2, 0, img_w)
60
+ return yl, yh, xl, xh
61
+
62
+
63
+ def rand_bbox_minmax(img_shape, minmax, count=None):
64
+ """ Min-Max CutMix bounding-box
65
+ Inspired by Darknet cutmix impl, generates a random rectangular bbox
66
+ based on min/max percent values applied to each dimension of the input image.
67
+
68
+ Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max.
69
+
70
+ Args:
71
+ img_shape (tuple): Image shape as tuple
72
+ minmax (tuple or list): Min and max bbox ratios (as percent of image size)
73
+ count (int): Number of bbox to generate
74
+ """
75
+ assert len(minmax) == 2
76
+ img_h, img_w = img_shape[-2:]
77
+ cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count)
78
+ cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count)
79
+ yl = np.random.randint(0, img_h - cut_h, size=count)
80
+ xl = np.random.randint(0, img_w - cut_w, size=count)
81
+ yu = yl + cut_h
82
+ xu = xl + cut_w
83
+ return yl, yu, xl, xu
84
+
85
+
86
+ def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None):
87
+ """ Generate bbox and apply lambda correction.
88
+ """
89
+ if ratio_minmax is not None:
90
+ yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count)
91
+ else:
92
+ yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count)
93
+ if correct_lam or ratio_minmax is not None:
94
+ bbox_area = (yu - yl) * (xu - xl)
95
+ lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1])
96
+ return (yl, yu, xl, xu), lam
97
+
98
+
99
+ class BatchMixup:
100
+ """ Mixup/Cutmix that applies different params to each element or whole batch
101
+
102
+ Args:
103
+ mixup_alpha (float): mixup alpha value, mixup is active if > 0.
104
+ cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0.
105
+ cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
106
+ prob (float): probability of applying mixup or cutmix per batch or element
107
+ switch_prob (float): probability of switching to cutmix instead of mixup when both are active
108
+ mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
109
+ correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders
110
+ label_smoothing (float): apply label smoothing to the mixed target tensor
111
+ num_classes (int): number of classes for target
112
+ """
113
+ def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5,
114
+ mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000):
115
+ assert mode == 'batch'
116
+ self.mixup_alpha = mixup_alpha
117
+ self.cutmix_alpha = cutmix_alpha
118
+ self.cutmix_minmax = cutmix_minmax
119
+ if self.cutmix_minmax is not None:
120
+ assert len(self.cutmix_minmax) == 2
121
+ # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
122
+ self.cutmix_alpha = 1.0
123
+ self.mix_prob = prob
124
+ self.switch_prob = switch_prob
125
+ self.label_smoothing = label_smoothing
126
+ self.num_classes = num_classes
127
+ self.mode = mode
128
+ self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix
129
+ self.mixup_enabled = True # set to false to disable mixing (intended to be set by train loop)
130
+
131
+ def _params_per_batch(self):
132
+ lam = 1.
133
+ use_cutmix = False
134
+ if self.mixup_enabled and np.random.rand() < self.mix_prob:
135
+ if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
136
+ use_cutmix = np.random.rand() < self.switch_prob
137
+ lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \
138
+ np.random.beta(self.mixup_alpha, self.mixup_alpha)
139
+ elif self.mixup_alpha > 0.:
140
+ lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha)
141
+ elif self.cutmix_alpha > 0.:
142
+ use_cutmix = True
143
+ lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)
144
+ else:
145
+ assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
146
+ lam = float(lam_mix)
147
+ return lam, use_cutmix
148
+
149
+ def _mix_batch(self, x):
150
+ lam, use_cutmix = self._params_per_batch()
151
+ if lam == 1.:
152
+ return 1.
153
+ if use_cutmix:
154
+ (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
155
+ x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
156
+ x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh]
157
+ else:
158
+ x_flipped = x.flip(0).mul_(1. - lam)
159
+ x.mul_(lam).add_(x_flipped)
160
+ return lam
161
+
162
+ def __call__(self, x, raw_target):
163
+ if x.shape[0] % 2 == 1:
164
+ x, raw_target = torch.cat((x[:1], x), dim=0), torch.cat((raw_target[:1], raw_target), dim=0)
165
+ # assert len(x) % 2 == 0, 'Batch size should be even when using this'
166
+ lam = self._mix_batch(x)
167
+ target = mixup_target(raw_target, self.num_classes, lam, self.label_smoothing, x.device)
168
+ return x, target, raw_target
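
A minimal usage sketch (not part of the repository) of `BatchMixup` on a toy CPU batch; the shapes and hyperparameters are arbitrary, and the import assumes the script is run from `spark/downstream_imagenet/`. Note the odd batch size, which timm's original `Mixup` rejects and this class pads instead:

```python
import torch
from mixup import BatchMixup

mix = BatchMixup(mixup_alpha=0.8, cutmix_alpha=1.0, label_smoothing=0.1, num_classes=10)
x = torch.randn(5, 3, 32, 32)          # odd batch size on purpose
y = torch.randint(0, 10, (5,))
x_mixed, soft_target, raw_target = mix(x, y)
print(x_mixed.shape, soft_target.shape, raw_target.shape)  # batch is padded from 5 to 6
```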
spark/downstream_imagenet/models/__init__.py ADDED
@@ -0,0 +1,104 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+
9
+ import torch
10
+ from timm.data import Mixup
11
+ from timm.loss import BinaryCrossEntropy, SoftTargetCrossEntropy
12
+ from timm.models.layers import drop
13
+ from timm.models.resnet import ResNet
14
+
15
+ from .convnext_official import ConvNeXt
16
+
17
+
18
+ def convnext_get_layer_id_and_scale_exp(self: ConvNeXt, para_name: str):
19
+ N = 12 if len(self.stages[-2]) > 9 else 6
20
+ if para_name.startswith("downsample_layers"):
21
+ stage_id = int(para_name.split('.')[1])
22
+ if stage_id == 0:
23
+ layer_id = 0
24
+ elif stage_id == 1 or stage_id == 2:
25
+ layer_id = stage_id + 1
26
+ else: # stage_id == 3:
27
+ layer_id = N
28
+ elif para_name.startswith("stages"):
29
+ stage_id = int(para_name.split('.')[1])
30
+ block_id = int(para_name.split('.')[2])
31
+ if stage_id == 0 or stage_id == 1:
32
+ layer_id = stage_id + 1
33
+ elif stage_id == 2:
34
+ layer_id = 3 + block_id // 3
35
+ else: # stage_id == 3:
36
+ layer_id = N
37
+ else:
38
+ layer_id = N + 1 # after backbone
39
+
40
+ return layer_id, N + 1 - layer_id
41
+
42
+
43
+ def resnets_get_layer_id_and_scale_exp(self: ResNet, para_name: str):
44
+ # stages:
45
+ # 50 : [3, 4, 6, 3]
46
+ # 101 : [3, 4, 23, 3]
47
+ # 152 : [3, 8, 36, 3]
48
+ # 200 : [3, 24, 36, 3]
49
+ # eca269d: [3, 30, 48, 8]
50
+
51
+ L2, L3 = len(self.layer2), len(self.layer3)
52
+ if L2 == 4 and L3 == 6:
53
+ blk2, blk3 = 2, 3
54
+ elif L2 == 4 and L3 == 23:
55
+ blk2, blk3 = 2, 3
56
+ elif L2 == 8 and L3 == 36:
57
+ blk2, blk3 = 4, 4
58
+ elif L2 == 24 and L3 == 36:
59
+ blk2, blk3 = 4, 4
60
+ elif L2 == 30 and L3 == 48:
61
+ blk2, blk3 = 5, 6
62
+ else:
63
+ raise NotImplementedError
64
+
65
+ N2, N3 = math.ceil(L2 / blk2 - 1e-5), math.ceil(L3 / blk3 - 1e-5)
66
+ N = 2 + N2 + N3
67
+ if para_name.startswith('layer'): # 1, 2, 3, 4, 5
68
+ stage_id, block_id = int(para_name.split('.')[0][5:]), int(para_name.split('.')[1])
69
+ if stage_id == 1:
70
+ layer_id = 1
71
+ elif stage_id == 2:
72
+ layer_id = 2 + block_id // blk2 # 2, 3
73
+ elif stage_id == 3:
74
+ layer_id = 2 + N2 + block_id // blk3 # r50: 4, 5 r101: 4, 5, ..., 11
75
+ else: # == 4
76
+ layer_id = N # r50: 6 r101: 12
77
+ elif para_name.startswith('fc.'):
78
+ layer_id = N + 1 # r50: 7 r101: 13
79
+ else:
80
+ layer_id = 0
81
+
82
+ return layer_id, N + 1 - layer_id # r50: 0-7, 7-0 r101: 0-13, 13-0
83
+
84
+
85
+ def _ex_repr(self):
86
+ return ', '.join(
87
+ f'{k}=' + (f'{v:g}' if isinstance(v, float) else str(v))
88
+ for k, v in vars(self).items()
89
+ if not k.startswith('_') and k != 'training'
90
+ and not isinstance(v, (torch.nn.Module, torch.Tensor))
91
+ )
92
+
93
+
94
+ # IMPORTANT: update some member functions
95
+ __UPDATED = False
96
+ if not __UPDATED:
97
+ for clz in (torch.nn.CrossEntropyLoss, SoftTargetCrossEntropy, BinaryCrossEntropy, Mixup, drop.DropPath):
98
+ if hasattr(clz, 'extra_repr'):
99
+ clz.extra_repr = _ex_repr
100
+ else:
101
+ clz.__repr__ = lambda self: f'{type(self).__name__}({_ex_repr(self)})'
102
+ ResNet.get_layer_id_and_scale_exp = resnets_get_layer_id_and_scale_exp
103
+ ConvNeXt.get_layer_id_and_scale_exp = convnext_get_layer_id_and_scale_exp
104
+ __UPDATED = True
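Importing this module applies the patches above; a small sketch of how the attached `get_layer_id_and_scale_exp` hook could then be used to derive layer-wise learning-rate scales (the ResNet-50 model and the 0.7 decay base are illustrative assumptions, not values from this file):

```python
from timm.models.resnet import resnet50

# Illustrative only: query the monkey-patched hook for every trainable parameter.
model = resnet50()
decay_base = 0.7                        # assumed layer-wise LR decay factor

for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    layer_id, scale_exp = model.get_layer_id_and_scale_exp(name)
    lr_scale = decay_base ** scale_exp  # shallower layers -> larger exponent -> smaller LR
    # an optimizer builder would place `param` into a group with lr = base_lr * lr_scale
```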
spark/downstream_imagenet/models/convnext_official.py ADDED
@@ -0,0 +1,201 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # This file is exactly the same as: https://github.com/facebookresearch/ConvNeXt/blob/06f7b05f922e21914916406141f50f82b4a15852/models/convnext.py
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from timm.models.layers import trunc_normal_, DropPath
13
+ from timm.models.registry import register_model
14
+
15
+ class Block(nn.Module):
16
+ r""" ConvNeXt Block. There are two equivalent implementations:
17
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
18
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
19
+ We use (2) as we find it slightly faster in PyTorch
20
+
21
+ Args:
22
+ dim (int): Number of input channels.
23
+ drop_path (float): Stochastic depth rate. Default: 0.0
24
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
25
+ """
26
+ def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
27
+ super().__init__()
28
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
29
+ self.norm = LayerNorm(dim, eps=1e-6)
30
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
31
+ self.act = nn.GELU()
32
+ self.pwconv2 = nn.Linear(4 * dim, dim)
33
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
34
+ requires_grad=True) if layer_scale_init_value > 0 else None
35
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
36
+
37
+ def forward(self, x):
38
+ input = x
39
+ x = self.dwconv(x)
40
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
41
+ x = self.norm(x)
42
+ x = self.pwconv1(x)
43
+ x = self.act(x)
44
+ x = self.pwconv2(x)
45
+ if self.gamma is not None:
46
+ x = self.gamma * x
47
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
48
+
49
+ x = input + self.drop_path(x)
50
+ return x
51
+
52
+ class ConvNeXt(nn.Module):
53
+ r""" ConvNeXt
54
+ A PyTorch impl of : `A ConvNet for the 2020s` -
55
+ https://arxiv.org/pdf/2201.03545.pdf
56
+ Args:
57
+ in_chans (int): Number of input image channels. Default: 3
58
+ num_classes (int): Number of classes for classification head. Default: 1000
59
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
60
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
61
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
62
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
63
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
64
+ """
65
+ def __init__(self, in_chans=3, num_classes=1000,
66
+ depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0.,
67
+ layer_scale_init_value=1e-6, head_init_scale=1.,
68
+ ):
69
+ super().__init__()
70
+
71
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
72
+ stem = nn.Sequential(
73
+ nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
74
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
75
+ )
76
+ self.downsample_layers.append(stem)
77
+ for i in range(3):
78
+ downsample_layer = nn.Sequential(
79
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
80
+ nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
81
+ )
82
+ self.downsample_layers.append(downsample_layer)
83
+
84
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
85
+ dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
86
+ cur = 0
87
+ for i in range(4):
88
+ stage = nn.Sequential(
89
+ *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
90
+ layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
91
+ )
92
+ self.stages.append(stage)
93
+ cur += depths[i]
94
+
95
+ self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
96
+ self.head = nn.Linear(dims[-1], num_classes)
97
+
98
+ self.apply(self._init_weights)
99
+ self.head.weight.data.mul_(head_init_scale)
100
+ self.head.bias.data.mul_(head_init_scale)
101
+
102
+ def _init_weights(self, m):
103
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
104
+ trunc_normal_(m.weight, std=.02)
105
+ nn.init.constant_(m.bias, 0)
106
+
107
+ def forward_features(self, x):
108
+ for i in range(4):
109
+ x = self.downsample_layers[i](x)
110
+ x = self.stages[i](x)
111
+ return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C)
112
+
113
+ def forward(self, x):
114
+ x = self.forward_features(x)
115
+ x = self.head(x)
116
+ return x
117
+
118
+ class LayerNorm(nn.Module):
119
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
120
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
121
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
122
+ with shape (batch_size, channels, height, width).
123
+ """
124
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
125
+ super().__init__()
126
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
127
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
128
+ self.eps = eps
129
+ self.data_format = data_format
130
+ if self.data_format not in ["channels_last", "channels_first"]:
131
+ raise NotImplementedError
132
+ self.normalized_shape = (normalized_shape, )
133
+
134
+ def forward(self, x):
135
+ if self.data_format == "channels_last":
136
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
137
+ elif self.data_format == "channels_first":
138
+ u = x.mean(1, keepdim=True)
139
+ s = (x - u).pow(2).mean(1, keepdim=True)
140
+ x = (x - u) / torch.sqrt(s + self.eps)
141
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
142
+ return x
143
+
144
+
145
+ model_urls = {
146
+ "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
147
+ "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
148
+ "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
149
+ "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
150
+ "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
151
+ "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
152
+ "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
153
+ "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
154
+ "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
155
+ }
156
+
157
+ @register_model
158
+ def convnext_tiny(pretrained=False,in_22k=False, **kwargs):
159
+ model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
160
+ if pretrained:
161
+ url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k']
162
+ checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
163
+ model.load_state_dict(checkpoint["model"])
164
+ return model
165
+
166
+ @register_model
167
+ def convnext_small(pretrained=False,in_22k=False, **kwargs):
168
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
169
+ if pretrained:
170
+ url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k']
171
+ checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
172
+ model.load_state_dict(checkpoint["model"])
173
+ return model
174
+
175
+ @register_model
176
+ def convnext_base(pretrained=False, in_22k=False, **kwargs):
177
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
178
+ if pretrained:
179
+ url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k']
180
+ checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
181
+ model.load_state_dict(checkpoint["model"])
182
+ return model
183
+
184
+ @register_model
185
+ def convnext_large(pretrained=False, in_22k=False, **kwargs):
186
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
187
+ if pretrained:
188
+ url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k']
189
+ checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
190
+ model.load_state_dict(checkpoint["model"])
191
+ return model
192
+
193
+ @register_model
194
+ def convnext_xlarge(pretrained=False, in_22k=False, **kwargs):
195
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
196
+ if pretrained:
197
+ assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True"
198
+ url = model_urls['convnext_xlarge_22k']
199
+ checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
200
+ model.load_state_dict(checkpoint["model"])
201
+ return model
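A quick sanity-check sketch for the ConvNeXt variants defined above (random weights, shapes only):

```python
import torch

# Illustrative only: build ConvNeXt-Base from the class above and run a forward pass.
model = convnext_base(pretrained=False)  # depths [3, 3, 27, 3], dims [128, 256, 512, 1024]
model.eval()

x = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    logits = model(x)
print(logits.shape)                      # torch.Size([2, 1000])
```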
spark/downstream_imagenet/requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ numpy
2
+ Pillow
3
+ typed-argument-parser
4
+ timm==0.5.4
5
+ tensorboardx
spark/downstream_imagenet/util.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import datetime
8
+ import os
9
+ import sys
10
+ from functools import partial
11
+ from typing import List, Tuple, Callable
12
+
13
+ import pytz
14
+ import torch
15
+ import torch.distributed as tdist
16
+ import torch.multiprocessing as tmp
17
+ from timm import create_model
18
+ from timm.loss import SoftTargetCrossEntropy, BinaryCrossEntropy
19
+ from timm.optim import AdamW, Lamb
20
+ from timm.utils import ModelEmaV2
21
+ from torch.nn.parallel import DistributedDataParallel
22
+ from torch.optim.optimizer import Optimizer
23
+
24
+ from arg import FineTuneArgs
25
+ from mixup import BatchMixup
26
+ from lr_decay import get_param_groups
27
+
28
+
29
+ def time_str(for_dirname=False):
30
+ return datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('%m-%d_%H-%M-%S' if for_dirname else '[%m-%d %H:%M:%S]')
31
+
32
+
33
+ def init_distributed_environ():
34
+ # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29
35
+ if tmp.get_start_method(allow_none=True) is None:
36
+ tmp.set_start_method('spawn')
37
+ global_rank, num_gpus = int(os.environ.get('RANK', 'error')), torch.cuda.device_count()
38
+ local_rank = global_rank % num_gpus
39
+ torch.cuda.set_device(local_rank)
40
+
41
+ tdist.init_process_group(backend='nccl')
42
+ assert tdist.is_initialized(), 'torch.distributed is not initialized!'
43
+ torch.backends.cudnn.benchmark = True
44
+ torch.backends.cudnn.deterministic = False
45
+
46
+ # print only when local_rank == 0 or print(..., force=True)
47
+ import builtins as __builtin__
48
+ builtin_print = __builtin__.print
49
+
50
+ def prt(msg, *args, **kwargs):
51
+ force = kwargs.pop('force', False)
52
+ if local_rank == 0 or force:
53
+ f_back = sys._getframe().f_back
54
+ file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
55
+ builtin_print(f'{time_str()} ({file_desc}, line{f_back.f_lineno:-4d})=> {msg}', *args, **kwargs)
56
+
57
+ __builtin__.print = prt
58
+ tdist.barrier()
59
+ return tdist.get_world_size(), global_rank, local_rank, torch.empty(1).cuda().device
60
+
61
+
62
+ def create_model_opt(args: FineTuneArgs) -> Tuple[torch.nn.Module, Callable, torch.nn.Module, DistributedDataParallel, ModelEmaV2, Optimizer]:
63
+ num_classes = 1000
64
+ model_without_ddp: torch.nn.Module = create_model(args.model, num_classes=num_classes, drop_path_rate=args.drop_path).to(args.device)
65
+ model_para = f'{sum(p.numel() for p in model_without_ddp.parameters() if p.requires_grad) / 1e6:.1f}M'
66
+ # create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
67
+ model_ema = ModelEmaV2(model_without_ddp, decay=args.ema, device=args.device)
68
+ if args.sbn:
69
+ model_without_ddp = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model_without_ddp)
70
+ print(f'[model={args.model}] [#para={model_para}, drop_path={args.drop_path}, ema={args.ema}] {model_without_ddp}\n')
71
+ model = DistributedDataParallel(model_without_ddp, device_ids=[args.local_rank], find_unused_parameters=False, broadcast_buffers=False)
72
+ model.train()
73
+ opt_cls = {
74
+ 'adam': AdamW, 'adamw': AdamW,
75
+ 'lamb': partial(Lamb, max_grad_norm=1e7, always_adapt=True, bias_correction=False),
76
+ }
77
+ param_groups: List[dict] = get_param_groups(model_without_ddp, nowd_keys={'cls_token', 'pos_embed', 'mask_token', 'gamma'}, lr_scale=args.lr_scale)
78
+ # param_groups[0] is like this: {'params': List[nn.Parameters], 'lr': float, 'lr_scale': float, 'weight_decay': float, 'weight_decay_scale': float}
79
+ optimizer = opt_cls[args.opt](param_groups, lr=args.lr, weight_decay=0)
80
+ print(f'[optimizer={type(optimizer)}]')
81
+ mixup_fn = BatchMixup(
82
+ mixup_alpha=args.mixup, cutmix_alpha=1.0, cutmix_minmax=None,
83
+ prob=1.0, switch_prob=0.5, mode='batch',
84
+ label_smoothing=0.1, num_classes=num_classes
85
+ )
86
+ mixup_fn.mixup_enabled = args.mixup > 0.0
87
+ if 'lamb' in args.opt:
88
+ # label smoothing is handled in BatchMixup via `label_smoothing`, so here smoothing=0
89
+ criterion = BinaryCrossEntropy(smoothing=0, target_threshold=None)
90
+ else:
91
+ criterion = SoftTargetCrossEntropy()
92
+ print(f'[loss_fn] {criterion}')
93
+ print(f'[mixup_fn] {mixup_fn}')
94
+ return criterion, mixup_fn, model_without_ddp, model, model_ema, optimizer
95
+
96
+
97
+ def load_checkpoint(resume_from, model_without_ddp, ema_module, optimizer):
98
+ if len(resume_from) == 0 or not os.path.exists(resume_from):
99
+ raise AttributeError(f'ckpt `{resume_from}` not found!')
100
+ # return 0, '[no performance_desc]'
101
+ print(f'[try to resume from file `{resume_from}`]')
102
+ checkpoint = torch.load(resume_from, map_location='cpu')
103
+ assert checkpoint.get('is_pretrain', False) == False, 'Please do not use `*_withdecoder_1kpretrained_spark_style.pth`, which is ONLY for resuming the pretraining. Use `*_1kpretrained_timm_style.pth` or `*_1kfinetuned*.pth` instead.'
104
+
105
+ ep_start, performance_desc = checkpoint.get('epoch', -1) + 1, checkpoint.get('performance_desc', '[no performance_desc]')
106
+ missing, unexpected = model_without_ddp.load_state_dict(checkpoint.get('module', checkpoint), strict=False)
107
+ print(f'[load_checkpoint] missing_keys={missing}')
108
+ print(f'[load_checkpoint] unexpected_keys={unexpected}')
109
+ print(f'[load_checkpoint] ep_start={ep_start}, performance_desc={performance_desc}')
110
+
111
+ if 'optimizer' in checkpoint:
112
+ optimizer.load_state_dict(checkpoint['optimizer'])
113
+ if 'ema' in checkpoint:
114
+ ema_module.load_state_dict(checkpoint['ema'])
115
+ return ep_start, performance_desc
116
+
117
+
118
+ def save_checkpoint(save_to, args, epoch, performance_desc, model_without_ddp_state, ema_state, optimizer_state):
119
+ checkpoint_path = os.path.join(args.exp_dir, save_to)
120
+ if args.is_local_master:
121
+ to_save = {
122
+ 'args': str(args),
123
+ 'arch': args.model,
124
+ 'epoch': epoch,
125
+ 'performance_desc': performance_desc,
126
+ 'module': model_without_ddp_state,
127
+ 'ema': ema_state,
128
+ 'optimizer': optimizer_state,
129
+ 'is_pretrain': False,
130
+ }
131
+ torch.save(to_save, checkpoint_path)
spark/downstream_mmdet/README.md ADDED
@@ -0,0 +1,76 @@
1
+ ## About code isolation
2
+
3
+ This `downstream_mmdet` is isolated from pre-training codes. One can treat this `downstream_mmdet` as an independent codebase 🛠️.
4
+
5
+ ## Fine-tuned ConvNeXt-B weights, log files, and performance
6
+
7
+
8
+ <div align="center">
9
+
10
+ [[`weights (pre-trained by SparK)`](https://drive.google.com/file/d/1ZjWbqI1qoBcqeQijI5xX9E-YNkxpJcYV/view?usp=share_link)]
11
+ [[`weights (fine-tuned on COCO)`](https://drive.google.com/file/d/1t10dmzg5KOO27o2yIglK-gQepB5gR4zR/view?usp=share_link)]
12
+ [[`log.json`](https://drive.google.com/file/d/1TuNboXl1qwjf1tggZ3QOssI67uU7Jtig/view?usp=share_link)]
13
+ [[`log`](https://drive.google.com/file/d/1JY5CkL_MX08zJ8P1FBIeC60OJsuIiyZc/view?usp=sharing)]
14
+ </div>
15
+
16
+
17
+ <p align="center">
18
+ <img src="https://user-images.githubusercontent.com/39692511/211497396-cd031318-ef54-45a4-a283-cd9810c15603.png" width=80%>
19
+ </p>
20
+
21
+
22
+ ## Installation: [MMDetection at commit 6a979e2](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/tree/6a979e2164e3fb0de0ca2546545013a4d71b2f7d), used before fine-tuning ConvNeXt on COCO
23
+
24
+ We refer to the codebases of [ConvNeXt](https://github.com/facebookresearch/ConvNeXt/tree/048efcea897d999aed302f2639b6270aedf8d4c8) and [Swin-Transformer-Object-Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/tree/6a979e2164e3fb0de0ca2546545013a4d71b2f7d).
25
+ Please refer to [README.md](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/6a979e2164e3fb0de0ca2546545013a4d71b2f7d/README.md) for installation and dataset preparation instructions.
26
+
27
+ Note the COCO dataset folder should be at `downstream_mmdet/data/coco`.
28
+ The folder should follow the directory structure required by `MMDetection`, which should look like this:
29
+ ```
30
+ downstream_mmdet/data/coco:
31
+ annotations/:
32
+ captions_train2017.json captions_val2017.json
33
+ instances_train2017.json instances_val2017.json
34
+ person_keypoints_train2017.json person_keypoints_val2017.json
35
+ train2017/:
36
+ a_lot_images.jpg
37
+ val2017/:
38
+ a_lot_images.jpg
39
+ ```
40
+
41
+
42
+ ### Training
43
+
44
+ To train a detector with pre-trained models, run:
45
+ ```
46
+ # single-gpu training
47
+ python tools/train.py <CONFIG_FILE> --cfg-options model.pretrained=<PRETRAIN_MODEL> [other optional arguments]
48
+
49
+ # multi-gpu training
50
+ tools/dist_train.sh <CONFIG_FILE> <GPU_NUM> --cfg-options model.pretrained=<PRETRAIN_MODEL> [other optional arguments]
51
+ ```
52
+ For example, to train a Mask R-CNN model with a SparK-pretrained `ConvNeXt-B` backbone on 4 GPUs, run:
53
+ ```
54
+ tools/dist_train.sh configs/convnext_spark/mask_rcnn_convnext_base_patch4_window7_mstrain_480-800_adamw_3x_coco_in1k.py 4 \
55
+ --cfg-options model.pretrained=/some/path/to/official_convnext_base_1kpretrained.pth
56
+ ```
57
+
58
+ The Mask R-CNN 3x fine-tuning config file can be found at [`configs/convnext_spark`](configs/convnext_spark). This config is basically a copy of [https://github.com/facebookresearch/ConvNeXt/blob/main/object_detection/configs/convnext/mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco_in1k.py](https://github.com/facebookresearch/ConvNeXt/blob/main/object_detection/configs/convnext/mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco_in1k.py).
59
+
60
+ ### Inference
61
+ ```
62
+ # single-gpu testing
63
+ python tools/test.py <CONFIG_FILE> <DET_CHECKPOINT_FILE> --eval bbox segm
64
+
65
+ # multi-gpu testing
66
+ tools/dist_test.sh <CONFIG_FILE> <DET_CHECKPOINT_FILE> <GPU_NUM> --eval bbox segm
67
+ ```
68
+
69
+ ## Acknowledgment
70
+
71
+ We appreciate these useful codebases:
72
+
73
+ - [MMDetection](https://github.com/open-mmlab/mmdetection)
74
+ - [ConvNeXt](https://github.com/facebookresearch/ConvNeXt)
75
+ - [Swin-Transformer-Object-Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection)
76
+
spark/downstream_mmdet/configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,16 @@
1
+ checkpoint_config = dict(interval=1)
2
+ # yapf:disable
3
+ log_config = dict(
4
+ interval=50,
5
+ hooks=[
6
+ dict(type='CustomizedTextLoggerHook'),
7
+ # dict(type='TensorboardLoggerHook')
8
+ ])
9
+ # yapf:enable
10
+ custom_hooks = [dict(type='NumClassCheckHook')]
11
+
12
+ dist_params = dict(backend='nccl')
13
+ log_level = 'INFO'
14
+ load_from = None
15
+ resume_from = None
16
+ workflow = [('train', 1)]
spark/downstream_mmdet/configs/_base_/models/cascade_mask_rcnn_convnext_fpn.py ADDED
@@ -0,0 +1,208 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ # model settings
10
+ model = dict(
11
+ type='CascadeRCNN',
12
+ pretrained=None,
13
+ backbone=dict(
14
+ type='ConvNeXt',
15
+ in_chans=3,
16
+ depths=[3, 3, 9, 3],
17
+ dims=[96, 192, 384, 768],
18
+ drop_path_rate=0.2,
19
+ layer_scale_init_value=1e-6,
20
+ out_indices=[0, 1, 2, 3],
21
+ ),
22
+ neck=dict(
23
+ type='FPN',
24
+ in_channels=[128, 256, 512, 1024],
25
+ out_channels=256,
26
+ num_outs=5),
27
+ rpn_head=dict(
28
+ type='RPNHead',
29
+ in_channels=256,
30
+ feat_channels=256,
31
+ anchor_generator=dict(
32
+ type='AnchorGenerator',
33
+ scales=[8],
34
+ ratios=[0.5, 1.0, 2.0],
35
+ strides=[4, 8, 16, 32, 64]),
36
+ bbox_coder=dict(
37
+ type='DeltaXYWHBBoxCoder',
38
+ target_means=[.0, .0, .0, .0],
39
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
40
+ loss_cls=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
42
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
43
+ roi_head=dict(
44
+ type='CascadeRoIHead',
45
+ num_stages=3,
46
+ stage_loss_weights=[1, 0.5, 0.25],
47
+ bbox_roi_extractor=dict(
48
+ type='SingleRoIExtractor',
49
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
50
+ out_channels=256,
51
+ featmap_strides=[4, 8, 16, 32]),
52
+ bbox_head=[
53
+ dict(
54
+ type='Shared2FCBBoxHead',
55
+ in_channels=256,
56
+ fc_out_channels=1024,
57
+ roi_feat_size=7,
58
+ num_classes=80,
59
+ bbox_coder=dict(
60
+ type='DeltaXYWHBBoxCoder',
61
+ target_means=[0., 0., 0., 0.],
62
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
63
+ reg_class_agnostic=True,
64
+ loss_cls=dict(
65
+ type='CrossEntropyLoss',
66
+ use_sigmoid=False,
67
+ loss_weight=1.0),
68
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
69
+ loss_weight=1.0)),
70
+ dict(
71
+ type='Shared2FCBBoxHead',
72
+ in_channels=256,
73
+ fc_out_channels=1024,
74
+ roi_feat_size=7,
75
+ num_classes=80,
76
+ bbox_coder=dict(
77
+ type='DeltaXYWHBBoxCoder',
78
+ target_means=[0., 0., 0., 0.],
79
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
80
+ reg_class_agnostic=True,
81
+ loss_cls=dict(
82
+ type='CrossEntropyLoss',
83
+ use_sigmoid=False,
84
+ loss_weight=1.0),
85
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
86
+ loss_weight=1.0)),
87
+ dict(
88
+ type='Shared2FCBBoxHead',
89
+ in_channels=256,
90
+ fc_out_channels=1024,
91
+ roi_feat_size=7,
92
+ num_classes=80,
93
+ bbox_coder=dict(
94
+ type='DeltaXYWHBBoxCoder',
95
+ target_means=[0., 0., 0., 0.],
96
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
97
+ reg_class_agnostic=True,
98
+ loss_cls=dict(
99
+ type='CrossEntropyLoss',
100
+ use_sigmoid=False,
101
+ loss_weight=1.0),
102
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
103
+ ],
104
+ mask_roi_extractor=dict(
105
+ type='SingleRoIExtractor',
106
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
107
+ out_channels=256,
108
+ featmap_strides=[4, 8, 16, 32]),
109
+ mask_head=dict(
110
+ type='FCNMaskHead',
111
+ num_convs=4,
112
+ in_channels=256,
113
+ conv_out_channels=256,
114
+ num_classes=80,
115
+ loss_mask=dict(
116
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
117
+ # model training and testing settings
118
+ train_cfg = dict(
119
+ rpn=dict(
120
+ assigner=dict(
121
+ type='MaxIoUAssigner',
122
+ pos_iou_thr=0.7,
123
+ neg_iou_thr=0.3,
124
+ min_pos_iou=0.3,
125
+ match_low_quality=True,
126
+ ignore_iof_thr=-1),
127
+ sampler=dict(
128
+ type='RandomSampler',
129
+ num=256,
130
+ pos_fraction=0.5,
131
+ neg_pos_ub=-1,
132
+ add_gt_as_proposals=False),
133
+ allowed_border=0,
134
+ pos_weight=-1,
135
+ debug=False),
136
+ rpn_proposal=dict(
137
+ nms_across_levels=False,
138
+ nms_pre=2000,
139
+ nms_post=2000,
140
+ max_per_img=2000,
141
+ nms=dict(type='nms', iou_threshold=0.7),
142
+ min_bbox_size=0),
143
+ rcnn=[
144
+ dict(
145
+ assigner=dict(
146
+ type='MaxIoUAssigner',
147
+ pos_iou_thr=0.5,
148
+ neg_iou_thr=0.5,
149
+ min_pos_iou=0.5,
150
+ match_low_quality=False,
151
+ ignore_iof_thr=-1),
152
+ sampler=dict(
153
+ type='RandomSampler',
154
+ num=512,
155
+ pos_fraction=0.25,
156
+ neg_pos_ub=-1,
157
+ add_gt_as_proposals=True),
158
+ mask_size=28,
159
+ pos_weight=-1,
160
+ debug=False),
161
+ dict(
162
+ assigner=dict(
163
+ type='MaxIoUAssigner',
164
+ pos_iou_thr=0.6,
165
+ neg_iou_thr=0.6,
166
+ min_pos_iou=0.6,
167
+ match_low_quality=False,
168
+ ignore_iof_thr=-1),
169
+ sampler=dict(
170
+ type='RandomSampler',
171
+ num=512,
172
+ pos_fraction=0.25,
173
+ neg_pos_ub=-1,
174
+ add_gt_as_proposals=True),
175
+ mask_size=28,
176
+ pos_weight=-1,
177
+ debug=False),
178
+ dict(
179
+ assigner=dict(
180
+ type='MaxIoUAssigner',
181
+ pos_iou_thr=0.7,
182
+ neg_iou_thr=0.7,
183
+ min_pos_iou=0.7,
184
+ match_low_quality=False,
185
+ ignore_iof_thr=-1),
186
+ sampler=dict(
187
+ type='RandomSampler',
188
+ num=512,
189
+ pos_fraction=0.25,
190
+ neg_pos_ub=-1,
191
+ add_gt_as_proposals=True),
192
+ mask_size=28,
193
+ pos_weight=-1,
194
+ debug=False)
195
+ ]),
196
+ test_cfg = dict(
197
+ rpn=dict(
198
+ nms_across_levels=False,
199
+ nms_pre=1000,
200
+ nms_post=1000,
201
+ max_per_img=1000,
202
+ nms=dict(type='nms', iou_threshold=0.7),
203
+ min_bbox_size=0),
204
+ rcnn=dict(
205
+ score_thr=0.05,
206
+ nms=dict(type='nms', iou_threshold=0.5),
207
+ max_per_img=100,
208
+ mask_thr_binary=0.5)))
spark/downstream_mmdet/configs/_base_/models/mask_rcnn_convnext_fpn.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ # model settings
10
+ model = dict(
11
+ type='MaskRCNN',
12
+ pretrained=None,
13
+ backbone=dict(
14
+ type='ConvNeXt',
15
+ in_chans=3,
16
+ depths=[3, 3, 9, 3],
17
+ dims=[96, 192, 384, 768],
18
+ drop_path_rate=0.2,
19
+ layer_scale_init_value=1e-6,
20
+ out_indices=[0, 1, 2, 3],
21
+ ),
22
+ neck=dict(
23
+ type='FPN',
24
+ in_channels=[128, 256, 512, 1024],
25
+ out_channels=256,
26
+ num_outs=5),
27
+ rpn_head=dict(
28
+ type='RPNHead',
29
+ in_channels=256,
30
+ feat_channels=256,
31
+ anchor_generator=dict(
32
+ type='AnchorGenerator',
33
+ scales=[8],
34
+ ratios=[0.5, 1.0, 2.0],
35
+ strides=[4, 8, 16, 32, 64]),
36
+ bbox_coder=dict(
37
+ type='DeltaXYWHBBoxCoder',
38
+ target_means=[.0, .0, .0, .0],
39
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
40
+ loss_cls=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
42
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
43
+ roi_head=dict(
44
+ type='StandardRoIHead',
45
+ bbox_roi_extractor=dict(
46
+ type='SingleRoIExtractor',
47
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
48
+ out_channels=256,
49
+ featmap_strides=[4, 8, 16, 32]),
50
+ bbox_head=dict(
51
+ type='Shared2FCBBoxHead',
52
+ in_channels=256,
53
+ fc_out_channels=1024,
54
+ roi_feat_size=7,
55
+ num_classes=80,
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[0., 0., 0., 0.],
59
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
60
+ reg_class_agnostic=False,
61
+ loss_cls=dict(
62
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
63
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
64
+ mask_roi_extractor=dict(
65
+ type='SingleRoIExtractor',
66
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
67
+ out_channels=256,
68
+ featmap_strides=[4, 8, 16, 32]),
69
+ mask_head=dict(
70
+ type='FCNMaskHead',
71
+ num_convs=4,
72
+ in_channels=256,
73
+ conv_out_channels=256,
74
+ num_classes=80,
75
+ loss_mask=dict(
76
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
77
+ # model training and testing settings
78
+ train_cfg=dict(
79
+ rpn=dict(
80
+ assigner=dict(
81
+ type='MaxIoUAssigner',
82
+ pos_iou_thr=0.7,
83
+ neg_iou_thr=0.3,
84
+ min_pos_iou=0.3,
85
+ match_low_quality=True,
86
+ ignore_iof_thr=-1),
87
+ sampler=dict(
88
+ type='RandomSampler',
89
+ num=256,
90
+ pos_fraction=0.5,
91
+ neg_pos_ub=-1,
92
+ add_gt_as_proposals=False),
93
+ allowed_border=-1,
94
+ pos_weight=-1,
95
+ debug=False),
96
+ rpn_proposal=dict(
97
+ nms_pre=2000,
98
+ max_per_img=1000,
99
+ nms=dict(type='nms', iou_threshold=0.7),
100
+ min_bbox_size=0),
101
+ rcnn=dict(
102
+ assigner=dict(
103
+ type='MaxIoUAssigner',
104
+ pos_iou_thr=0.5,
105
+ neg_iou_thr=0.5,
106
+ min_pos_iou=0.5,
107
+ match_low_quality=True,
108
+ ignore_iof_thr=-1),
109
+ sampler=dict(
110
+ type='RandomSampler',
111
+ num=512,
112
+ pos_fraction=0.25,
113
+ neg_pos_ub=-1,
114
+ add_gt_as_proposals=True),
115
+ mask_size=28,
116
+ pos_weight=-1,
117
+ debug=False)),
118
+ test_cfg=dict(
119
+ rpn=dict(
120
+ nms_pre=1000,
121
+ max_per_img=1000,
122
+ nms=dict(type='nms', iou_threshold=0.7),
123
+ min_bbox_size=0),
124
+ rcnn=dict(
125
+ score_thr=0.05,
126
+ nms=dict(type='nms', iou_threshold=0.5),
127
+ max_per_img=100,
128
+ mask_thr_binary=0.5)))
spark/downstream_mmdet/configs/convnext_spark/mask_rcnn_convnext_base_patch4_window7_mstrain_480-800_adamw_3x_coco_in1k.py ADDED
@@ -0,0 +1,95 @@
1
+ """
2
+ We directly take the ConvNeXt-T+MaskRCNN 3x recipe from https://github.com/facebookresearch/ConvNeXt/blob/main/object_detection/configs/convnext/mask_rcnn_convnext_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco_in1k.py
3
+ And we modify this ConvNeXt-T+MaskRCNN 3x recipe to our ConvNeXt-B+MaskRCNN 3x recipe.
4
+ The modifications (commented as [modified] below) are according to:
5
+ - 1. tiny-to-base: (some configs of ConvNext-T are updated to those of ConvNext-B, referring to https://github.com/facebookresearch/ConvNeXt/blob/main/object_detection/configs/convnext/cascade_mask_rcnn_convnext_base_patch4_window7_mstrain_480-800_giou_4conv1f_adamw_3x_coco_in22k.py)
6
+ - model.backbone.{depths, dims, drop_path_rate}
7
+ - models.neck
8
+ - optimizer.paramwise_cfg.num_layers
9
+
10
+ - 2. our paper (https://openreview.net/forum?id=NRxydtWup1S, or https://arxiv.org/abs/2301.03580):
11
+ - LR layer decay (optimizer.paramwise_cfg.decay_rate): 0.65
12
+ - LR scheduled ratio (lr_config.gamma): 0.2
13
+ - Learning rate (optimizer.lr): 0.0002
14
+ - optimizer_config.use_fp16: False (we just use fp32 by default; actually we didn't test the performance of using fp16)
15
+ """
16
+
17
+ _base_ = [
18
+ '../_base_/models/mask_rcnn_convnext_fpn.py',
19
+ '../_base_/datasets/coco_instance.py',
20
+ '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
21
+ ]
22
+
23
+ model = dict(
24
+ backbone=dict(
25
+ in_chans=3,
26
+ depths=[3, 3, 27, 3], # [modified] according to tiny-to-base
27
+ dims=[128, 256, 512, 1024], # [modified] according to tiny-to-base
28
+ drop_path_rate=0.5, # [modified] according to tiny-to-base
29
+ layer_scale_init_value=1.0,
30
+ out_indices=[0, 1, 2, 3],
31
+ ),
32
+ neck=dict(in_channels=[128, 256, 512, 1024])) # [modified] according to tiny-to-base
33
+
34
+ img_norm_cfg = dict(
35
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
36
+
37
+ # augmentation strategy originates from DETR / Sparse RCNN
38
+ train_pipeline = [
39
+ dict(type='LoadImageFromFile'),
40
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
41
+ dict(type='RandomFlip', flip_ratio=0.5),
42
+ dict(type='AutoAugment',
43
+ policies=[
44
+ [
45
+ dict(type='Resize',
46
+ img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
47
+ (608, 1333), (640, 1333), (672, 1333), (704, 1333),
48
+ (736, 1333), (768, 1333), (800, 1333)],
49
+ multiscale_mode='value',
50
+ keep_ratio=True)
51
+ ],
52
+ [
53
+ dict(type='Resize',
54
+ img_scale=[(400, 1333), (500, 1333), (600, 1333)],
55
+ multiscale_mode='value',
56
+ keep_ratio=True),
57
+ dict(type='RandomCrop',
58
+ crop_type='absolute_range',
59
+ crop_size=(384, 600),
60
+ allow_negative_crop=True),
61
+ dict(type='Resize',
62
+ img_scale=[(480, 1333), (512, 1333), (544, 1333),
63
+ (576, 1333), (608, 1333), (640, 1333),
64
+ (672, 1333), (704, 1333), (736, 1333),
65
+ (768, 1333), (800, 1333)],
66
+ multiscale_mode='value',
67
+ override=True,
68
+ keep_ratio=True)
69
+ ]
70
+ ]),
71
+ dict(type='Normalize', **img_norm_cfg),
72
+ dict(type='Pad', size_divisor=32),
73
+ dict(type='DefaultFormatBundle'),
74
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
75
+ ]
76
+ data = dict(train=dict(pipeline=train_pipeline))
77
+
78
+ optimizer = dict(constructor='LearningRateDecayOptimizerConstructor', _delete_=True, type='AdamW',
79
+ lr=0.0002, betas=(0.9, 0.999), weight_decay=0.05, # [modified] according to our paper
80
+ paramwise_cfg={'decay_rate': 0.65, # [modified] according to our paper
81
+ 'decay_type': 'layer_wise',
82
+ 'num_layers': 12}) # [modified] according to tiny-to-base
83
+ lr_config = dict(step=[27, 33], gamma=0.2) # [modified] according to our paper
84
+ runner = dict(type='EpochBasedRunnerAmp', max_epochs=36)
85
+
86
+ # do not use mmdet version fp16
87
+ fp16 = None
88
+ optimizer_config = dict(
89
+ type="DistOptimizerHook",
90
+ update_interval=1,
91
+ grad_clip=None,
92
+ coalesce=True,
93
+ bucket_size_mb=-1,
94
+ use_fp16=False, # [modified] True => False
95
+ )
spark/downstream_mmdet/mmcv_custom/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ # -*- coding: utf-8 -*-
10
+
11
+ from .checkpoint import load_checkpoint
12
+ from .layer_decay_optimizer_constructor import LearningRateDecayOptimizerConstructor
13
+ from .customized_text import CustomizedTextLoggerHook
14
+
15
+ __all__ = ['load_checkpoint', 'LearningRateDecayOptimizerConstructor', 'CustomizedTextLoggerHook']
spark/downstream_mmdet/mmcv_custom/customized_text.py ADDED
@@ -0,0 +1,130 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import datetime
10
+ from collections import OrderedDict
11
+
12
+ import torch
13
+
14
+ import mmcv
15
+ from mmcv.runner import HOOKS
16
+ from mmcv.runner import TextLoggerHook
17
+
18
+
19
+ @HOOKS.register_module()
20
+ class CustomizedTextLoggerHook(TextLoggerHook):
21
+ """Customized Text Logger hook.
22
+
23
+ This logger prints out both lr and layer_0_lr.
24
+
25
+ """
26
+
27
+ def _log_info(self, log_dict, runner):
28
+ # print exp name for users to distinguish experiments
29
+ # at every ``interval_exp_name`` iterations and the end of each epoch
30
+ if runner.meta is not None and 'exp_name' in runner.meta:
31
+ if (self.every_n_iters(runner, self.interval_exp_name)) or (
32
+ self.by_epoch and self.end_of_epoch(runner)):
33
+ exp_info = f'Exp name: {runner.meta["exp_name"]}'
34
+ runner.logger.info(exp_info)
35
+
36
+ if log_dict['mode'] == 'train':
37
+ lr_str = {}
38
+ for lr_type in ['lr', 'layer_0_lr']:
39
+ if isinstance(log_dict[lr_type], dict):
40
+ lr_str[lr_type] = []
41
+ for k, val in log_dict[lr_type].items():
42
+ lr_str[lr_type].append(f'{lr_type}_{k}: {val:.3e}')
43
+ lr_str[lr_type] = ' '.join(lr_str[lr_type])
44
+ else:
45
+ lr_str[lr_type] = f'{lr_type}: {log_dict[lr_type]:.3e}'
46
+
47
+ # by epoch: Epoch [4][100/1000]
48
+ # by iter: Iter [100/100000]
49
+ if self.by_epoch:
50
+ log_str = f'Epoch [{log_dict["epoch"]}]' \
51
+ f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t'
52
+ else:
53
+ log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t'
54
+ log_str += f'{lr_str["lr"]}, {lr_str["layer_0_lr"]}, '
55
+
56
+ if 'time' in log_dict.keys():
57
+ self.time_sec_tot += (log_dict['time'] * self.interval)
58
+ time_sec_avg = self.time_sec_tot / (
59
+ runner.iter - self.start_iter + 1)
60
+ eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
61
+ eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
62
+ log_str += f'eta: {eta_str}, '
63
+ log_str += f'time: {log_dict["time"]:.3f}, ' \
64
+ f'data_time: {log_dict["data_time"]:.3f}, '
65
+ # statistic memory
66
+ if torch.cuda.is_available():
67
+ log_str += f'memory: {log_dict["memory"]}, '
68
+ else:
69
+ # val/test time
70
+ # here 1000 is the length of the val dataloader
71
+ # by epoch: Epoch[val] [4][1000]
72
+ # by iter: Iter[val] [1000]
73
+ if self.by_epoch:
74
+ log_str = f'Epoch({log_dict["mode"]}) ' \
75
+ f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t'
76
+ else:
77
+ log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t'
78
+
79
+ log_items = []
80
+ for name, val in log_dict.items():
81
+ # TODO: resolve this hack
82
+ # these items have been in log_str
83
+ if name in [
84
+ 'mode', 'Epoch', 'iter', 'lr', 'layer_0_lr', 'time', 'data_time',
85
+ 'memory', 'epoch'
86
+ ]:
87
+ continue
88
+ if isinstance(val, float):
89
+ val = f'{val:.4f}'
90
+ log_items.append(f'{name}: {val}')
91
+ log_str += ', '.join(log_items)
92
+
93
+ runner.logger.info(log_str)
94
+
95
+
96
+ def log(self, runner):
97
+ if 'eval_iter_num' in runner.log_buffer.output:
98
+ # this doesn't modify runner.iter and is regardless of by_epoch
99
+ cur_iter = runner.log_buffer.output.pop('eval_iter_num')
100
+ else:
101
+ cur_iter = self.get_iter(runner, inner_iter=True)
102
+
103
+ log_dict = OrderedDict(
104
+ mode=self.get_mode(runner),
105
+ epoch=self.get_epoch(runner),
106
+ iter=cur_iter)
107
+
108
+ # record lr and layer_0_lr
109
+ cur_lr = runner.current_lr()
110
+ if isinstance(cur_lr, list):
111
+ log_dict['layer_0_lr'] = min(cur_lr)
112
+ log_dict['lr'] = max(cur_lr)
113
+ else:
114
+ assert isinstance(cur_lr, dict)
115
+ log_dict['lr'], log_dict['layer_0_lr'] = {}, {}
116
+ for k, lr_ in cur_lr.items():
117
+ assert isinstance(lr_, list)
118
+ log_dict['layer_0_lr'].update({k: min(lr_)})
119
+ log_dict['lr'].update({k: max(lr_)})
120
+
121
+ if 'time' in runner.log_buffer.output:
122
+ # statistic memory
123
+ if torch.cuda.is_available():
124
+ log_dict['memory'] = self._get_max_memory(runner)
125
+
126
+ log_dict = dict(log_dict, **runner.log_buffer.output)
127
+
128
+ self._log_info(log_dict, runner)
129
+ self._dump_log(log_dict, runner)
130
+ return log_dict
spark/downstream_mmdet/mmcv_custom/layer_decay_optimizer_constructor.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ import json
10
+ from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
11
+ from mmcv.runner import get_dist_info
12
+
13
+
14
+ def get_num_layer_layer_wise(var_name, num_max_layer=12):
15
+
16
+ if var_name in ("backbone.cls_token", "backbone.mask_token", "backbone.pos_embed"):
17
+ return 0
18
+ elif var_name.startswith("backbone.downsample_layers"):
19
+ stage_id = int(var_name.split('.')[2])
20
+ if stage_id == 0:
21
+ layer_id = 0
22
+ elif stage_id == 1:
23
+ layer_id = 2
24
+ elif stage_id == 2:
25
+ layer_id = 3
26
+ elif stage_id == 3:
27
+ layer_id = num_max_layer
28
+ return layer_id
29
+ elif var_name.startswith("backbone.stages"):
30
+ stage_id = int(var_name.split('.')[2])
31
+ block_id = int(var_name.split('.')[3])
32
+ if stage_id == 0:
33
+ layer_id = 1
34
+ elif stage_id == 1:
35
+ layer_id = 2
36
+ elif stage_id == 2:
37
+ layer_id = 3 + block_id // 3
38
+ elif stage_id == 3:
39
+ layer_id = num_max_layer
40
+ return layer_id
41
+ else:
42
+ return num_max_layer + 1
43
+
44
+
45
+ def get_num_layer_stage_wise(var_name, num_max_layer):
46
+ if var_name in ("backbone.cls_token", "backbone.mask_token", "backbone.pos_embed"):
47
+ return 0
48
+ elif var_name.startswith("backbone.downsample_layers"):
49
+ return 0
50
+ elif var_name.startswith("backbone.stages"):
51
+ stage_id = int(var_name.split('.')[2])
52
+ return stage_id + 1
53
+ else:
54
+ return num_max_layer - 1
55
+
56
+
57
+ @OPTIMIZER_BUILDERS.register_module()
58
+ class LearningRateDecayOptimizerConstructor(DefaultOptimizerConstructor):
59
+ def add_params(self, params, module, prefix='', is_dcn_module=None):
60
+ """Add all parameters of module to the params list.
61
+ The parameters of the given module will be added to the list of param
62
+ groups, with specific rules defined by paramwise_cfg.
63
+ Args:
64
+ params (list[dict]): A list of param groups, it will be modified
65
+ in place.
66
+ module (nn.Module): The module to be added.
67
+ prefix (str): The prefix of the module
68
+ is_dcn_module (int|float|None): If the current module is a
69
+ submodule of DCN, `is_dcn_module` will be passed to
70
+ control conv_offset layer's learning rate. Defaults to None.
71
+ """
72
+ parameter_groups = {}
73
+ print(self.paramwise_cfg)
74
+ num_layers = self.paramwise_cfg.get('num_layers') + 2
75
+ decay_rate = self.paramwise_cfg.get('decay_rate')
76
+ decay_type = self.paramwise_cfg.get('decay_type', "layer_wise")
77
+ print("Build LearningRateDecayOptimizerConstructor %s %f - %d" % (decay_type, decay_rate, num_layers))
78
+ weight_decay = self.base_wd
79
+
80
+ for name, param in module.named_parameters():
81
+ if not param.requires_grad:
82
+ continue # frozen weights
83
+ if len(param.shape) == 1 or name.endswith(".bias") or name in ('pos_embed', 'cls_token'):
84
+ group_name = "no_decay"
85
+ this_weight_decay = 0.
86
+ else:
87
+ group_name = "decay"
88
+ this_weight_decay = weight_decay
89
+
90
+ if decay_type == "layer_wise":
91
+ layer_id = get_num_layer_layer_wise(name, self.paramwise_cfg.get('num_layers'))
92
+ elif decay_type == "stage_wise":
93
+ layer_id = get_num_layer_stage_wise(name, num_layers)
94
+
95
+ group_name = "layer_%d_%s" % (layer_id, group_name)
96
+
97
+ if group_name not in parameter_groups:
98
+ scale = decay_rate ** (num_layers - layer_id - 1)
99
+
100
+ parameter_groups[group_name] = {
101
+ "weight_decay": this_weight_decay,
102
+ "params": [],
103
+ "param_names": [],
104
+ "lr_scale": scale,
105
+ "group_name": group_name,
106
+ "lr": scale * self.base_lr,
107
+ }
108
+
109
+ parameter_groups[group_name]["params"].append(param)
110
+ parameter_groups[group_name]["param_names"].append(name)
111
+ rank, _ = get_dist_info()
112
+ if rank == 0:
113
+ to_display = {}
114
+ for key in parameter_groups:
115
+ to_display[key] = {
116
+ "param_names": parameter_groups[key]["param_names"],
117
+ "lr_scale": parameter_groups[key]["lr_scale"],
118
+ "lr": parameter_groups[key]["lr"],
119
+ "weight_decay": parameter_groups[key]["weight_decay"],
120
+ }
121
+ print("Param groups = %s" % json.dumps(to_display, indent=2))
122
+
123
+ params.extend(parameter_groups.values())
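For intuition, the per-group `lr_scale` computed above is `decay_rate ** (num_layers - layer_id - 1)`, where `add_params` adds 2 to the configured `num_layers`; a tiny sketch with the values used in the SparK ConvNeXt-B config (`decay_rate=0.65`, `num_layers=12`):

```python
# Illustrative only: reproduce the LR scales the constructor above would assign.
decay_rate = 0.65
num_layers = 12 + 2                      # add_params() adds 2 to paramwise_cfg['num_layers']

for layer_id in range(num_layers):       # 0 = stem/embeddings ... 13 = heads
    lr_scale = decay_rate ** (num_layers - layer_id - 1)
    print(f'layer {layer_id:2d}: lr_scale = {lr_scale:.4f}')
# layer 13 (everything outside the backbone) keeps the full base LR (scale 1.0),
# while layer 0 (the stem) ends up with roughly 0.65 ** 13 ≈ 0.0037 of it.
```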
spark/downstream_mmdet/mmcv_custom/runner/checkpoint.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (c) Open-MMLab. All rights reserved.
2
+ import os.path as osp
3
+ import time
4
+ from tempfile import TemporaryDirectory
5
+
6
+ import torch
7
+ from torch.optim import Optimizer
8
+
9
+ import mmcv
10
+ from mmcv.parallel import is_module_wrapper
11
+ from mmcv.runner.checkpoint import weights_to_cpu, get_state_dict
12
+
13
+ try:
14
+ import apex
15
+ except ImportError:
16
+ print('apex is not installed')
17
+
18
+
19
+ def save_checkpoint(model, filename, optimizer=None, meta=None):
20
+ """Save checkpoint to file.
21
+
22
+ The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
23
+ ``optimizer``. By default ``meta`` will contain version
24
+ and time info.
25
+
26
+ Args:
27
+ model (Module): Module whose params are to be saved.
28
+ filename (str): Checkpoint filename.
29
+ optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
30
+ meta (dict, optional): Metadata to be saved in checkpoint.
31
+ """
32
+ if meta is None:
33
+ meta = {}
34
+ elif not isinstance(meta, dict):
35
+ raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
36
+ meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
37
+
38
+ if is_module_wrapper(model):
39
+ model = model.module
40
+
41
+ if hasattr(model, 'CLASSES') and model.CLASSES is not None:
42
+ # save class name to the meta
43
+ meta.update(CLASSES=model.CLASSES)
44
+
45
+ checkpoint = {
46
+ 'meta': meta,
47
+ 'state_dict': weights_to_cpu(get_state_dict(model))
48
+ }
49
+ # save optimizer state dict in the checkpoint
50
+ if isinstance(optimizer, Optimizer):
51
+ checkpoint['optimizer'] = optimizer.state_dict()
52
+ elif isinstance(optimizer, dict):
53
+ checkpoint['optimizer'] = {}
54
+ for name, optim in optimizer.items():
55
+ checkpoint['optimizer'][name] = optim.state_dict()
56
+
57
+ # save amp state dict in the checkpoint
58
+ # checkpoint['amp'] = apex.amp.state_dict()
59
+
60
+ if filename.startswith('pavi://'):
61
+ try:
62
+ from pavi import modelcloud
63
+ from pavi.exception import NodeNotFoundError
64
+ except ImportError:
65
+ raise ImportError(
66
+ 'Please install pavi to load checkpoint from modelcloud.')
67
+ model_path = filename[7:]
68
+ root = modelcloud.Folder()
69
+ model_dir, model_name = osp.split(model_path)
70
+ try:
71
+ model = modelcloud.get(model_dir)
72
+ except NodeNotFoundError:
73
+ model = root.create_training_model(model_dir)
74
+ with TemporaryDirectory() as tmp_dir:
75
+ checkpoint_file = osp.join(tmp_dir, model_name)
76
+ with open(checkpoint_file, 'wb') as f:
77
+ torch.save(checkpoint, f)
78
+ f.flush()
79
+ model.create_file(checkpoint_file, name=model_name)
80
+ else:
81
+ mmcv.mkdir_or_exist(osp.dirname(filename))
82
+ # immediately flush buffer
83
+ with open(filename, 'wb') as f:
84
+ torch.save(checkpoint, f)
85
+ f.flush()
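A minimal sketch of calling this `save_checkpoint` helper outside of an MMCV runner (the toy module, optimizer, and output path are assumptions):

```python
import torch.nn as nn
from torch.optim import SGD

# Illustrative only: persist a toy module with the helper above (requires mmcv installed).
model = nn.Conv2d(3, 8, kernel_size=3)
optimizer = SGD(model.parameters(), lr=0.1)

save_checkpoint(model, 'work_dirs/demo/latest.pth',
                optimizer=optimizer, meta=dict(epoch=1, iter=100))
# the saved file then contains 'meta', 'state_dict' and 'optimizer' entries
```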
spark/downstream_mmdet/mmdet/models/backbones/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ from .darknet import Darknet
2
+ from .detectors_resnet import DetectoRS_ResNet
3
+ from .detectors_resnext import DetectoRS_ResNeXt
4
+ from .hourglass import HourglassNet
5
+ from .hrnet import HRNet
6
+ from .regnet import RegNet
7
+ from .res2net import Res2Net
8
+ from .resnest import ResNeSt
9
+ from .resnet import ResNet, ResNetV1d
10
+ from .resnext import ResNeXt
11
+ from .ssd_vgg import SSDVGG
12
+ from .trident_resnet import TridentResNet
13
+ from .swin_transformer import SwinTransformer
14
+ from .convnext import ConvNeXt
15
+
16
+ __all__ = [
17
+ 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'Res2Net',
18
+ 'HourglassNet', 'DetectoRS_ResNet', 'DetectoRS_ResNeXt', 'Darknet',
19
+ 'ResNeSt', 'TridentResNet', 'SwinTransformer', 'ConvNeXt'
20
+ ]
spark/downstream_mmdet/mmdet/models/backbones/convnext.py ADDED
@@ -0,0 +1,180 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ from functools import partial
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from timm.models.layers import trunc_normal_, DropPath
14
+
15
+ from mmcv_custom import load_checkpoint
16
+ from mmdet.utils import get_root_logger
17
+ from ..builder import BACKBONES
18
+
19
+ class Block(nn.Module):
20
+ r""" ConvNeXt Block. There are two equivalent implementations:
21
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
22
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
23
+ We use (2) as we find it slightly faster in PyTorch
24
+
25
+ Args:
26
+ dim (int): Number of input channels.
27
+ drop_path (float): Stochastic depth rate. Default: 0.0
28
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
29
+ """
30
+ def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
31
+ super().__init__()
32
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
33
+ self.norm = LayerNorm(dim, eps=1e-6)
34
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
35
+ self.act = nn.GELU()
36
+ self.pwconv2 = nn.Linear(4 * dim, dim)
37
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
38
+ requires_grad=True) if layer_scale_init_value > 0 else None
39
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
40
+
41
+ def forward(self, x):
42
+ input = x
43
+ x = self.dwconv(x)
44
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
45
+ x = self.norm(x)
46
+ x = self.pwconv1(x)
47
+ x = self.act(x)
48
+ x = self.pwconv2(x)
49
+ if self.gamma is not None:
50
+ x = self.gamma * x
51
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
52
+
53
+ x = input + self.drop_path(x)
54
+ return x
55
+
56
+ @BACKBONES.register_module()
57
+ class ConvNeXt(nn.Module):
58
+ r""" ConvNeXt
59
+ A PyTorch impl of : `A ConvNet for the 2020s` -
60
+ https://arxiv.org/pdf/2201.03545.pdf
61
+
62
+ Args:
63
+ in_chans (int): Number of input image channels. Default: 3
64
+ num_classes (int): Number of classes for classification head. Default: 1000
65
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
66
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
67
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
68
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
69
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
70
+ """
71
+ def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768],
72
+ drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3],
73
+ ):
74
+ super().__init__()
75
+
76
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
77
+ stem = nn.Sequential(
78
+ nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
79
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
80
+ )
81
+ self.downsample_layers.append(stem)
82
+ for i in range(3):
83
+ downsample_layer = nn.Sequential(
84
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
85
+ nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
86
+ )
87
+ self.downsample_layers.append(downsample_layer)
88
+
89
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
90
+ dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
91
+ cur = 0
92
+ for i in range(4):
93
+ stage = nn.Sequential(
94
+ *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
95
+ layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
96
+ )
97
+ self.stages.append(stage)
98
+ cur += depths[i]
99
+
100
+ self.out_indices = out_indices
101
+
102
+ norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first")
103
+ for i_layer in range(4):
104
+ layer = norm_layer(dims[i_layer])
105
+ layer_name = f'norm{i_layer}'
106
+ self.add_module(layer_name, layer)
107
+
108
+ self.apply(self._init_weights)
109
+
110
+ def _init_weights(self, m):
111
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
112
+ trunc_normal_(m.weight, std=.02)
113
+ nn.init.constant_(m.bias, 0)
114
+
115
+ def init_weights(self, pretrained=None):
116
+ """Initialize the weights in backbone.
117
+ Args:
118
+ pretrained (str, optional): Path to pre-trained weights.
119
+ Defaults to None.
120
+ """
121
+
122
+ def _init_weights(m):
123
+ if isinstance(m, nn.Linear):
124
+ trunc_normal_(m.weight, std=.02)
125
+ if isinstance(m, nn.Linear) and m.bias is not None:
126
+ nn.init.constant_(m.bias, 0)
127
+ elif isinstance(m, nn.LayerNorm):
128
+ nn.init.constant_(m.bias, 0)
129
+ nn.init.constant_(m.weight, 1.0)
130
+
131
+ if isinstance(pretrained, str):
132
+ self.apply(_init_weights)
133
+ logger = get_root_logger()
134
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
135
+ elif pretrained is None:
136
+ self.apply(_init_weights)
137
+ else:
138
+ raise TypeError('pretrained must be a str or None')
139
+
140
+ def forward_features(self, x):
141
+ outs = []
142
+ for i in range(4):
143
+ x = self.downsample_layers[i](x)
144
+ x = self.stages[i](x)
145
+ if i in self.out_indices:
146
+ norm_layer = getattr(self, f'norm{i}')
147
+ x_out = norm_layer(x)
148
+ outs.append(x_out)
149
+
150
+ return tuple(outs)
151
+
152
+ def forward(self, x):
153
+ x = self.forward_features(x)
154
+ return x
155
+
156
+ class LayerNorm(nn.Module):
157
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
158
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
159
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
160
+ with shape (batch_size, channels, height, width).
161
+ """
162
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
163
+ super().__init__()
164
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
165
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
166
+ self.eps = eps
167
+ self.data_format = data_format
168
+ if self.data_format not in ["channels_last", "channels_first"]:
169
+ raise NotImplementedError
170
+ self.normalized_shape = (normalized_shape, )
171
+
172
+ def forward(self, x):
173
+ if self.data_format == "channels_last":
174
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
175
+ elif self.data_format == "channels_first":
176
+ u = x.mean(1, keepdim=True)
177
+ s = (x - u).pow(2).mean(1, keepdim=True)
178
+ x = (x - u) / torch.sqrt(s + self.eps)
179
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
180
+ return x
spark/pretrain/README.md ADDED
@@ -0,0 +1,118 @@
1
+ ## Preparation for ImageNet-1k pretraining
2
+
3
+ See [/INSTALL.md](/INSTALL.md) to prepare `pip` dependencies and the ImageNet dataset.
4
+
5
+ **Note: for neural network definitions, we directly use `timm.models.ResNet` and [official ConvNeXt](https://github.com/facebookresearch/ConvNeXt/blob/048efcea897d999aed302f2639b6270aedf8d4c8/models/convnext.py).**
6
+
7
+
8
+ ## Tutorial for pretraining your own CNN model
9
+
10
+ See [/pretrain/models/custom.py](/pretrain/models/custom.py). Your todo list is:
11
+
12
+ - implement `get_downsample_ratio` in [/pretrain/models/custom.py line20](/pretrain/models/custom.py#L20).
13
+ - implement `get_feature_map_channels` in [/pretrain/models/custom.py line29](/pretrain/models/custom.py#L29).
14
+ - implement `forward` in [/pretrain/models/custom.py line38](/pretrain/models/custom.py#L38).
15
+ - define `your_convnet(...)` with `@register_model` in [/pretrain/models/custom.py line54](/pretrain/models/custom.py#L53-L54).
16
+ - add default kwargs of `your_convnet(...)` in [/pretrain/models/\_\_init\_\_.py line34](/pretrain/models/__init__.py#L34).
17
+ - **Note: see [#54](/../../issues/54) if your CNN contains SE module or global average pooling layer, and see [#56](/../../issues/56) if it contains GroupNorm**.
18
+
19
+ Then run the experiment with `--model=your_convnet`.
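+ 
+ For reference, here is a minimal sketch of what such a model could look like (the names `TinyConvNet` / `tiny_convnet` are hypothetical and purely illustrative; the authoritative template is [/pretrain/models/custom.py](/pretrain/models/custom.py)):
+ 
+ ```python
+ import torch
+ import torch.nn as nn
+ from typing import List
+ from timm.models.registry import register_model
+ 
+ 
+ class TinyConvNet(nn.Module):  # hypothetical example, not part of this repo
+     def __init__(self, in_chans=3, dims=(64, 128, 256, 512), **kwargs):
+         super().__init__()
+         self.dims = list(dims)
+         strides = (4, 2, 2, 2)  # total downsample ratio = 4 * 2 * 2 * 2 = 32
+         cins = (in_chans,) + tuple(dims[:-1])
+         self.stages = nn.ModuleList(
+             nn.Sequential(nn.Conv2d(ci, co, kernel_size=s, stride=s), nn.BatchNorm2d(co), nn.ReLU(inplace=True))
+             for ci, co, s in zip(cins, dims, strides)
+         )
+ 
+     def get_downsample_ratio(self) -> int:
+         return 32  # product of the stage strides above
+ 
+     def get_feature_map_channels(self) -> List[int]:
+         return self.dims  # one entry per feature map returned by forward(..., hierarchical=True)
+ 
+     def forward(self, x: torch.Tensor, hierarchical=False):
+         feats = []
+         for stage in self.stages:
+             x = stage(x)
+             feats.append(x)
+         return feats if hierarchical else x  # hierarchical=True is what SparseEncoder calls
+ 
+ 
+ @register_model
+ def tiny_convnet(pretrained=False, **kwargs):
+     return TinyConvNet(**kwargs)
+ ```
+ 
+ After also adding its default kwargs in `models/__init__.py`, `--model=tiny_convnet` would select this backbone.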
20
+
21
+
22
+ ## Tutorial for pretraining on your own dataset
23
+
24
+ See the comment of `build_dataset_to_pretrain` in [line55 of /pretrain/utils/imagenet.py](/pretrain/utils/imagenet.py#L55). Your todo list:
25
+
26
+ - Define a subclass of `torch.utils.data.Dataset` for your own unlabeled dataset, to replace our `ImageNetDataset` (see the minimal sketch below).
27
+ - Use `args.data_path` and `args.input_size` to help build your dataset, with `--data_path=... --input_size=...` to specify them.
28
+ - Note that the batch size `--bs` is the total batch size over all GPUs, and may need to be adjusted based on your dataset size. FYI: we use `--bs=4096` for ImageNet, which contains 1.28 million images.
29
+
30
+ **If your dataset is relatively small**, you can try `--init_weight=/path/to/res50_withdecoder_1kpretrained_spark_style.pth` to do your pretraining *from our pretrained weights*, rather than *from scratch*.
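+ 
+ As a minimal sketch (hypothetical code, assuming your images sit in one flat folder; check the transform in `build_dataset_to_pretrain` if you want to match our ImageNet augmentation exactly), such a dataset could look like:
+ 
+ ```python
+ import os
+ from PIL import Image
+ from torch.utils.data import Dataset
+ from torchvision import transforms
+ 
+ 
+ class MyUnlabeledDataset(Dataset):  # hypothetical example
+     def __init__(self, data_path: str, input_size: int):
+         self.files = sorted(
+             os.path.join(data_path, f) for f in os.listdir(data_path)
+             if f.lower().endswith(('.jpg', '.jpeg', '.png'))
+         )
+         self.transform = transforms.Compose([
+             transforms.RandomResizedCrop(input_size),
+             transforms.RandomHorizontalFlip(),
+             transforms.ToTensor(),
+         ])
+ 
+     def __len__(self):
+         return len(self.files)
+ 
+     def __getitem__(self, idx):
+         img = Image.open(self.files[idx]).convert('RGB')
+         return self.transform(img)  # pretraining is self-supervised: return the image only, no label
+ ```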
31
+
32
+ ## Debug on 1 GPU (without DistributedDataParallel)
33
+
34
+ Use a small batch size such as `--bs=32` to avoid OOM.
35
+
36
+ ```shell script
37
+ python3 main.py --exp_name=debug --data_path=/path/to/imagenet --model=resnet50 --bs=32
38
+ ```
39
+
40
+
41
+ ## Pretraining Any Model on ImageNet-1k (224x224)
42
+
43
+ For pretraining, run [/pretrain/main.py](/pretrain/main.py) with `torchrun`.
44
+ **It is required to specify** the ImageNet data folder (`--data_path`), your experiment name & log dir (`--exp_name` and `--exp_dir`, created automatically if they do not exist), and the model name (`--model`; for valid choices, see the keys of `pretrain_default_model_kwargs` in [/pretrain/models/\_\_init\_\_.py line34](/pretrain/models/__init__.py#L34)).
45
+
46
+ We use the **same** pretraining configurations (lr, batch size, etc.) for all models (ResNets and ConvNeXts) in 224 pretraining.
47
+ Their **names** and **default values** are in [/pretrain/utils/arg_util.py line23-44](/pretrain/utils/arg_util.py#L23-L44).
48
+ All of these default configurations (like batch size 4096) will be used unless you override them, e.g. `--bs=512`.
49
+
50
+ **Note: the batch size `--bs` is the total batch size over all GPUs, and the learning rate `--base_lr` is the base lr. The actual lr is `lr = base_lr * bs / 256`, as in [/pretrain/utils/arg_util.py line131](/pretrain/utils/arg_util.py#L131). So do not use `--lr` to specify an lr (it will be ignored).**
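+ 
+ For instance (pure arithmetic with placeholder values):
+ 
+ ```python
+ base_lr, bs = 2e-4, 512  # placeholder values, passed via --base_lr and --bs
+ lr = base_lr * bs / 256  # = 4e-4, the lr the optimizer actually uses
+ ```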
51
+
52
+ Here is an example of pretraining a ResNet50 on a single 8-GPU machine (we use DistributedDataParallel), overriding the default batch size with 512:
53
+ ```shell script
54
+ $ cd /path/to/SparK/pretrain
55
+ $ torchrun --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr=localhost --master_port=<some_port> main.py \
56
+ --data_path=/path/to/imagenet --exp_name=<your_exp_name> --exp_dir=/path/to/logdir \
57
+ --model=resnet50 --bs=512
58
+ ```
59
+
60
+ For multiple machines, change `--nnodes`, `--node_rank`, `--master_addr`, and `--master_port` to match your configuration. E.g.:
61
+ ```shell script
62
+ $ torchrun --nproc_per_node=8 --nnodes=<your_nnodes> --node_rank=<rank_starts_from_0> --master_addr=<some_address> --master_port=<some_port> main.py \
63
+ ...
64
+ ```
65
+
66
+ ## Pretraining ConvNeXt-Large on ImageNet-1k (384x384)
67
+
68
+ For 384 pretraining we use a larger mask ratio (0.75), half the batch size (2048), and double the base learning rate (4e-4):
69
+
70
+ ```shell script
71
+ $ cd /path/to/SparK/pretrain
72
+ $ torchrun --nproc_per_node=8 --nnodes=<your_nnodes> --node_rank=<rank_starts_from_0> --master_addr=<some_address> --master_port=<some_port> main.py \
73
+ --data_path=/path/to/imagenet --exp_name=<your_exp_name> --exp_dir=/path/to/logdir \
74
+ --model=convnext_large --input_size=384 --mask=0.75 --bs=2048 --base_lr=4e-4
75
+ ```
76
+
77
+ ## Logging
78
+
79
+ See files in your `--exp_dir` to track your experiment:
80
+
81
+ - `<model>_withdecoder_1kpretrained_spark_style.pth`: saves model and optimizer states, current epoch, current reconstruction loss, etc.; can be used to resume pretraining; can also be used for visualization in [/pretrain/viz_reconstruction.ipynb](/pretrain/viz_reconstruction.ipynb)
82
+ - `<model>_1kpretrained_timm_style.pth`: can be used for downstream finetuning
83
+ - `pretrain_log.txt`: records some important information such as:
84
+ - `git_commit_id`: git version
85
+ - `cmd`: the command of this experiment
86
+
87
+ It also reports the loss and remaining pretraining time.
88
+
89
+ - `tensorboard_log/`: saves many tensorboard logs including loss values, learning rates, gradient norms, and more. Use `tensorboard --logdir /path/to/this/tensorboard_log/ --port 23333` to view them.
90
+ - `stdout_backup.txt` and `stderr_backup.txt`: backups of stdout/stderr.
91
+
92
+ ## Resuming
93
+
94
+ Specify `--resume_from=path/to/<model>_withdecoder_1kpretrained_spark_style.pth` to resume pretraining. Note this is different from `--init_weight`:
95
+
96
+ - `--resume_from` loads three things: model weights, optimizer states, and the current epoch, so it is used to resume an interrupted experiment (it will start from that 'current epoch').
97
+ - `--init_weight` ONLY loads the model weights, so it's just like a model initialization (will start from epoch 0).
98
+
99
+
100
+ ## Regarding sparse convolution
101
+
102
+ We do not use sparse convolutions in this PyTorch implementation, due to their limited optimization on modern hardware.
103
+ As can be found in [/pretrain/encoder.py](/pretrain/encoder.py), we use masked dense convolution to simulate submanifold sparse convolution.
104
+ We also define some sparse pooling or normalization layers in [/pretrain/encoder.py](/pretrain/encoder.py).
105
+ All these "sparse" layers are implemented with PyTorch built-in operators.
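+ 
+ The idea can be illustrated with a few lines of plain PyTorch (a toy sketch with made-up shapes, not the repo's code; cf. `sp_conv_forward` and `sp_bn_forward` in [/pretrain/encoder.py](/pretrain/encoder.py)):
+ 
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ B, C, H, W = 2, 8, 32, 32
+ x = torch.randn(B, C, H, W)
+ active = torch.rand(B, 1, H // 8, W // 8) > 0.6                        # patch-wise mask (True = kept)
+ mask = active.repeat_interleave(8, dim=2).repeat_interleave(8, dim=3)  # upsampled to this resolution
+ 
+ # "sparse" conv: run a dense conv, then re-zero the masked positions (cf. sp_conv_forward)
+ y = nn.Conv2d(C, C, kernel_size=3, padding=1)(x) * mask
+ 
+ # "sparse" BN: normalize only the kept positions with a plain BatchNorm1d (cf. sp_bn_forward)
+ ii = mask.squeeze(1).nonzero(as_tuple=True)  # (batch, h, w) indices of kept positions
+ nc = y.permute(0, 2, 3, 1)[ii]               # (N_kept, C) flattened features
+ nc = nn.BatchNorm1d(C)(nc)
+ ```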
106
+
107
+
108
+ ## Some details: how we mask images and how to set the patch size
109
+
110
+ In SparK, the mask patch size **equals** the downsample ratio of the CNN model (so there is no configuration like `--patch_size=32`).
111
+
112
+ Here is the reason: when masking, we:
113
+
114
+ 1. first generate the binary mask for the **smallest** resolution feature map, i.e., generate the `_cur_active` or `active_b1ff` in [/pretrain/spark.py line86-87](/pretrain/spark.py#L86-L87), which is a `torch.BoolTensor` shaped as `[B, 1, fmap_h, fmap_w]` and is used to mask the smallest feature map.
115
+ 2. then progressively upsample it (i.e., expand its 2nd and 3rd dimensions by calling `repeat_interleave(..., dim=2)` and `repeat_interleave(..., dim=3)` in [/pretrain/encoder.py line16](/pretrain/encoder.py#L16)), to mask those feature maps ([`x` in line21](/pretrain/encoder.py#L21)) with larger resolutions.
116
+
117
+ So if you want a patch size of 16 or 8, you should actually define a new CNN model with a downsample ratio of 16 or 8.
118
+ See [Tutorial for pretraining your own CNN model (above)](https://github.com/keyu-tian/SparK/tree/main/pretrain/#tutorial-for-pretraining-your-own-cnn-model).
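+ 
+ A toy sketch of these two steps (hypothetical numbers; the actual mask generation lives in [/pretrain/spark.py](/pretrain/spark.py)):
+ 
+ ```python
+ import torch
+ 
+ input_size, downsample_ratio, mask_ratio = 224, 32, 0.6  # placeholder values
+ fmap_h = fmap_w = input_size // downsample_ratio          # 7: the smallest feature map is 7x7
+ 
+ # step 1: a boolean mask on the smallest feature map (True = kept / "active" patch)
+ B, L = 2, fmap_h * fmap_w
+ n_keep = round(L * (1 - mask_ratio))
+ active_b1ff = (torch.rand(B, L).argsort(dim=1) < n_keep).view(B, 1, fmap_h, fmap_w)
+ 
+ # step 2: progressively upsample it to mask the higher-resolution feature maps
+ for ratio in (4, 8, 16, 32):                              # typical ResNet/ConvNeXt stage resolutions
+     r = (input_size // ratio) // fmap_h
+     m = active_b1ff.repeat_interleave(r, dim=2).repeat_interleave(r, dim=3)
+     print(ratio, tuple(m.shape))                          # e.g. 4 (2, 1, 56, 56)
+ ```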
spark/pretrain/decoder.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from typing import List
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from timm.models.layers import trunc_normal_
13
+
14
+ from utils.misc import is_pow2n
15
+
16
+
17
+ class UNetBlock(nn.Module):
18
+ def __init__(self, cin, cout, bn2d):
19
+ """
20
+ a UNet block with 2x up sampling
21
+ """
22
+ super().__init__()
23
+ self.up_sample = nn.ConvTranspose2d(cin, cin, kernel_size=4, stride=2, padding=1, bias=True)
24
+ self.conv = nn.Sequential(
25
+ nn.Conv2d(cin, cin, kernel_size=3, stride=1, padding=1, bias=False), bn2d(cin), nn.ReLU6(inplace=True),
26
+ nn.Conv2d(cin, cout, kernel_size=3, stride=1, padding=1, bias=False), bn2d(cout),
27
+ )
28
+
29
+ def forward(self, x):
30
+ x = self.up_sample(x)
31
+ return self.conv(x)
32
+
33
+
34
+ class LightDecoder(nn.Module):
35
+ def __init__(self, up_sample_ratio, width=768, sbn=True): # todo: the decoder's width follows a simple halving rule; you can change it to any other rule
36
+ super().__init__()
37
+ self.width = width
38
+ assert is_pow2n(up_sample_ratio)
39
+ n = round(math.log2(up_sample_ratio))
40
+ channels = [self.width // 2 ** i for i in range(n + 1)] # todo: the decoder's width follows a simple halving rule; you can change it to any other rule
41
+ bn2d = nn.SyncBatchNorm if sbn else nn.BatchNorm2d
42
+ self.dec = nn.ModuleList([UNetBlock(cin, cout, bn2d) for (cin, cout) in zip(channels[:-1], channels[1:])])
43
+ self.proj = nn.Conv2d(channels[-1], 3, kernel_size=1, stride=1, bias=True)
44
+
45
+ self.initialize()
46
+
47
+ def forward(self, to_dec: List[torch.Tensor]):
48
+ x = 0
49
+ for i, d in enumerate(self.dec):
50
+ if i < len(to_dec) and to_dec[i] is not None:
51
+ x = x + to_dec[i]
52
+ x = self.dec[i](x)
53
+ return self.proj(x)
54
+
55
+ def extra_repr(self) -> str:
56
+ return f'width={self.width}'
57
+
58
+ def initialize(self):
59
+ for m in self.modules():
60
+ if isinstance(m, nn.Linear):
61
+ trunc_normal_(m.weight, std=.02)
62
+ if m.bias is not None:
63
+ nn.init.constant_(m.bias, 0)
64
+ elif isinstance(m, nn.Conv2d):
65
+ trunc_normal_(m.weight, std=.02)
66
+ if m.bias is not None:
67
+ nn.init.constant_(m.bias, 0)
68
+ elif isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
69
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
70
+ if m.bias is not None:
71
+ nn.init.constant_(m.bias, 0.)
72
+ elif isinstance(m, (nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.SyncBatchNorm)):
73
+ nn.init.constant_(m.bias, 0)
74
+ nn.init.constant_(m.weight, 1.0)
spark/pretrain/dist.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ from typing import List
9
+ from typing import Union
10
+
11
+ import sys
12
+ import torch
13
+ import torch.distributed as tdist
14
+ import torch.multiprocessing as mp
15
+
16
+ __rank, __local_rank, __world_size, __device = 0, 0, 1, 'cpu'
17
+ __initialized = False
18
+
19
+
20
+ def initialized():
21
+ return __initialized
22
+
23
+
24
+ def initialize(backend='nccl'):
25
+ global __device
26
+ if not torch.cuda.is_available():
27
+ print(f'[dist initialize] cuda is not available, use cpu instead', file=sys.stderr)
28
+ return
29
+ elif 'RANK' not in os.environ:
30
+ __device = torch.empty(1).cuda().device
31
+ print(f'[dist initialize] RANK is not set, use 1 GPU instead', file=sys.stderr)
32
+ return
33
+
34
+ # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29
35
+ if mp.get_start_method(allow_none=True) is None:
36
+ mp.set_start_method('spawn')
37
+ global_rank, num_gpus = int(os.environ['RANK']), torch.cuda.device_count()
38
+ local_rank = global_rank % num_gpus
39
+ torch.cuda.set_device(local_rank)
40
+ tdist.init_process_group(backend=backend)
41
+
42
+ global __rank, __local_rank, __world_size, __initialized
43
+ __local_rank = local_rank
44
+ __rank, __world_size = tdist.get_rank(), tdist.get_world_size()
45
+ __device = torch.empty(1).cuda().device
46
+ __initialized = True
47
+
48
+ assert tdist.is_initialized(), 'torch.distributed is not initialized!'
49
+
50
+
51
+ def get_rank():
52
+ return __rank
53
+
54
+
55
+ def get_local_rank():
56
+ return __local_rank
57
+
58
+
59
+ def get_world_size():
60
+ return __world_size
61
+
62
+
63
+ def get_device():
64
+ return __device
65
+
66
+
67
+ def is_master():
68
+ return __rank == 0
69
+
70
+
71
+ def is_local_master():
72
+ return __local_rank == 0
73
+
74
+
75
+ def barrier():
76
+ if __initialized:
77
+ tdist.barrier()
78
+
79
+
80
+ def parallelize(net, syncbn=False):
81
+ if syncbn:
82
+ net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
83
+ net = net.cuda()
84
+ net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[get_local_rank()], find_unused_parameters=False, broadcast_buffers=False)
85
+ return net
86
+
87
+
88
+ def allreduce(t: torch.Tensor) -> None:
89
+ if __initialized:
90
+ if not t.is_cuda:
91
+ cu = t.detach().cuda()
92
+ tdist.all_reduce(cu)
93
+ t.copy_(cu.cpu())
94
+ else:
95
+ tdist.all_reduce(t)
96
+
97
+
98
+ def allgather(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
99
+ if __initialized:
100
+ if not t.is_cuda:
101
+ t = t.cuda()
102
+ ls = [torch.empty_like(t) for _ in range(__world_size)]
103
+ tdist.all_gather(ls, t)
104
+ else:
105
+ ls = [t]
106
+ if cat:
107
+ ls = torch.cat(ls, dim=0)
108
+ return ls
109
+
110
+
111
+ def broadcast(t: torch.Tensor, src_rank) -> None:
112
+ if __initialized:
113
+ if not t.is_cuda:
114
+ cu = t.detach().cuda()
115
+ tdist.broadcast(cu, src=src_rank)
116
+ t.copy_(cu.cpu())
117
+ else:
118
+ tdist.broadcast(t, src=src_rank)
spark/pretrain/encoder.py ADDED
@@ -0,0 +1,208 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from timm.models.layers import DropPath
10
+
11
+
12
+ _cur_active: torch.Tensor = None # B1ff
13
+ # todo: try to use `gather` for speed?
14
+ def _get_active_ex_or_ii(H, W, returning_active_ex=True):
15
+ h_repeat, w_repeat = H // _cur_active.shape[-2], W // _cur_active.shape[-1]
16
+ active_ex = _cur_active.repeat_interleave(h_repeat, dim=2).repeat_interleave(w_repeat, dim=3)
17
+ return active_ex if returning_active_ex else active_ex.squeeze(1).nonzero(as_tuple=True) # ii: bi, hi, wi
18
+
19
+
20
+ def sp_conv_forward(self, x: torch.Tensor):
21
+ x = super(type(self), self).forward(x)
22
+ x *= _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=True) # (BCHW) *= (B1HW), mask the output of conv
23
+ return x
24
+
25
+
26
+ def sp_bn_forward(self, x: torch.Tensor):
27
+ ii = _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=False)
28
+
29
+ bhwc = x.permute(0, 2, 3, 1)
30
+ nc = bhwc[ii] # select the features on non-masked positions to form a flatten feature `nc`
31
+ nc = super(type(self), self).forward(nc) # use BN1d to normalize this flatten feature `nc`
32
+
33
+ bchw = torch.zeros_like(bhwc)
34
+ bchw[ii] = nc
35
+ bchw = bchw.permute(0, 3, 1, 2)
36
+ return bchw
37
+
38
+
39
+ class SparseConv2d(nn.Conv2d):
40
+ forward = sp_conv_forward # hack: override the forward function; see `sp_conv_forward` above for more details
41
+
42
+
43
+ class SparseMaxPooling(nn.MaxPool2d):
44
+ forward = sp_conv_forward # hack: override the forward function; see `sp_conv_forward` above for more details
45
+
46
+
47
+ class SparseAvgPooling(nn.AvgPool2d):
48
+ forward = sp_conv_forward # hack: override the forward function; see `sp_conv_forward` above for more details
49
+
50
+
51
+ class SparseBatchNorm2d(nn.BatchNorm1d):
52
+ forward = sp_bn_forward # hack: override the forward function; see `sp_bn_forward` above for more details
53
+
54
+
55
+ class SparseSyncBatchNorm2d(nn.SyncBatchNorm):
56
+ forward = sp_bn_forward # hack: override the forward function; see `sp_bn_forward` above for more details
57
+
58
+
59
+ class SparseConvNeXtLayerNorm(nn.LayerNorm):
60
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
61
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
62
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
63
+ with shape (batch_size, channels, height, width).
64
+ """
65
+
66
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last", sparse=True):
67
+ if data_format not in ["channels_last", "channels_first"]:
68
+ raise NotImplementedError
69
+ super().__init__(normalized_shape, eps, elementwise_affine=True)
70
+ self.data_format = data_format
71
+ self.sparse = sparse
72
+
73
+ def forward(self, x):
74
+ if x.ndim == 4: # BHWC or BCHW
75
+ if self.data_format == "channels_last": # BHWC
76
+ if self.sparse:
77
+ ii = _get_active_ex_or_ii(H=x.shape[1], W=x.shape[2], returning_active_ex=False)
78
+ nc = x[ii]
79
+ nc = super(SparseConvNeXtLayerNorm, self).forward(nc)
80
+
81
+ x = torch.zeros_like(x)
82
+ x[ii] = nc
83
+ return x
84
+ else:
85
+ return super(SparseConvNeXtLayerNorm, self).forward(x)
86
+ else: # channels_first, BCHW
87
+ if self.sparse:
88
+ ii = _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=False)
89
+ bhwc = x.permute(0, 2, 3, 1)
90
+ nc = bhwc[ii]
91
+ nc = super(SparseConvNeXtLayerNorm, self).forward(nc)
92
+
93
+ x = torch.zeros_like(bhwc)
94
+ x[ii] = nc
95
+ return x.permute(0, 3, 1, 2)
96
+ else:
97
+ u = x.mean(1, keepdim=True)
98
+ s = (x - u).pow(2).mean(1, keepdim=True)
99
+ x = (x - u) / torch.sqrt(s + self.eps)
100
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
101
+ return x
102
+ else: # BLC or BC
103
+ if self.sparse:
104
+ raise NotImplementedError
105
+ else:
106
+ return super(SparseConvNeXtLayerNorm, self).forward(x)
107
+
108
+ def __repr__(self):
109
+ return super(SparseConvNeXtLayerNorm, self).__repr__()[:-1] + f', ch={self.data_format.split("_")[-1]}, sp={self.sparse})'
110
+
111
+
112
+ class SparseConvNeXtBlock(nn.Module):
113
+ r""" ConvNeXt Block. There are two equivalent implementations:
114
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
115
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
116
+ We use (2) as we find it slightly faster in PyTorch
117
+
118
+ Args:
119
+ dim (int): Number of input channels.
120
+ drop_path (float): Stochastic depth rate. Default: 0.0
121
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
122
+ """
123
+
124
+ def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6, sparse=True, ks=7):
125
+ super().__init__()
126
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=ks, padding=ks//2, groups=dim) # depthwise conv
127
+ self.norm = SparseConvNeXtLayerNorm(dim, eps=1e-6, sparse=sparse)
128
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
129
+ self.act = nn.GELU()
130
+ self.pwconv2 = nn.Linear(4 * dim, dim)
131
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
132
+ requires_grad=True) if layer_scale_init_value > 0 else None
133
+ self.drop_path: nn.Module = DropPath(drop_path) if drop_path > 0. else nn.Identity()
134
+ self.sparse = sparse
135
+
136
+ def forward(self, x):
137
+ input = x
138
+ x = self.dwconv(x)
139
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
140
+ x = self.norm(x)
141
+ x = self.pwconv1(x)
142
+ x = self.act(x) # GELU(0) == 0, so there is no need to mask x (no need to `x *= _get_active_ex_or_ii`)
143
+ x = self.pwconv2(x)
144
+ if self.gamma is not None:
145
+ x = self.gamma * x
146
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
147
+
148
+ if self.sparse:
149
+ x *= _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=True)
150
+
151
+ x = input + self.drop_path(x)
152
+ return x
153
+
154
+ def __repr__(self):
155
+ return super(SparseConvNeXtBlock, self).__repr__()[:-1] + f', sp={self.sparse})'
156
+
157
+
158
+ class SparseEncoder(nn.Module):
159
+ def __init__(self, cnn, input_size, sbn=False, verbose=False):
160
+ super(SparseEncoder, self).__init__()
161
+ self.sp_cnn = SparseEncoder.dense_model_to_sparse(m=cnn, verbose=verbose, sbn=sbn)
162
+ self.input_size, self.downsample_raito, self.enc_feat_map_chs = input_size, cnn.get_downsample_ratio(), cnn.get_feature_map_channels()
163
+
164
+ @staticmethod
165
+ def dense_model_to_sparse(m: nn.Module, verbose=False, sbn=False):
166
+ oup = m
167
+ if isinstance(m, nn.Conv2d):
168
+ m: nn.Conv2d
169
+ bias = m.bias is not None
170
+ oup = SparseConv2d(
171
+ m.in_channels, m.out_channels,
172
+ kernel_size=m.kernel_size, stride=m.stride, padding=m.padding,
173
+ dilation=m.dilation, groups=m.groups, bias=bias, padding_mode=m.padding_mode,
174
+ )
175
+ oup.weight.data.copy_(m.weight.data)
176
+ if bias:
177
+ oup.bias.data.copy_(m.bias.data)
178
+ elif isinstance(m, nn.MaxPool2d):
179
+ m: nn.MaxPool2d
180
+ oup = SparseMaxPooling(m.kernel_size, stride=m.stride, padding=m.padding, dilation=m.dilation, return_indices=m.return_indices, ceil_mode=m.ceil_mode)
181
+ elif isinstance(m, nn.AvgPool2d):
182
+ m: nn.AvgPool2d
183
+ oup = SparseAvgPooling(m.kernel_size, m.stride, m.padding, ceil_mode=m.ceil_mode, count_include_pad=m.count_include_pad, divisor_override=m.divisor_override)
184
+ elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
185
+ m: nn.BatchNorm2d
186
+ oup = (SparseSyncBatchNorm2d if sbn else SparseBatchNorm2d)(m.weight.shape[0], eps=m.eps, momentum=m.momentum, affine=m.affine, track_running_stats=m.track_running_stats)
187
+ oup.weight.data.copy_(m.weight.data)
188
+ oup.bias.data.copy_(m.bias.data)
189
+ oup.running_mean.data.copy_(m.running_mean.data)
190
+ oup.running_var.data.copy_(m.running_var.data)
191
+ oup.num_batches_tracked.data.copy_(m.num_batches_tracked.data)
192
+ if hasattr(m, "qconfig"):
193
+ oup.qconfig = m.qconfig
194
+ elif isinstance(m, nn.LayerNorm) and not isinstance(m, SparseConvNeXtLayerNorm):
195
+ m: nn.LayerNorm
196
+ oup = SparseConvNeXtLayerNorm(m.weight.shape[0], eps=m.eps)
197
+ oup.weight.data.copy_(m.weight.data)
198
+ oup.bias.data.copy_(m.bias.data)
199
+ elif isinstance(m, (nn.Conv1d,)):
200
+ raise NotImplementedError
201
+
202
+ for name, child in m.named_children():
203
+ oup.add_module(name, SparseEncoder.dense_model_to_sparse(child, verbose=verbose, sbn=sbn))
204
+ del m
205
+ return oup
206
+
207
+ def forward(self, x):
208
+ return self.sp_cnn(x, hierarchical=True)
spark/pretrain/main.py ADDED
@@ -0,0 +1,191 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import datetime
8
+ import math
9
+ import sys
10
+ import time
11
+ from functools import partial
12
+ from typing import List
13
+
14
+ import torch
15
+ from torch.nn.parallel import DistributedDataParallel
16
+ from torch.utils.data import DataLoader
17
+
18
+ import dist
19
+ import encoder
20
+ from decoder import LightDecoder
21
+ from models import build_sparse_encoder
22
+ from sampler import DistInfiniteBatchSampler, worker_init_fn
23
+ from spark import SparK
24
+ from utils import arg_util, misc, lamb
25
+ from utils.imagenet import build_dataset_to_pretrain
26
+ from utils.lr_control import lr_wd_annealing, get_param_groups
27
+
28
+
29
+ class LocalDDP(torch.nn.Module):
30
+ def __init__(self, module):
31
+ super(LocalDDP, self).__init__()
32
+ self.module = module
33
+
34
+ def forward(self, *args, **kwargs):
35
+ return self.module(*args, **kwargs)
36
+
37
+
38
+ def main_pt():
39
+ args: arg_util.Args = arg_util.init_dist_and_get_args()
40
+ print(f'initial args:\n{str(args)}')
41
+ args.log_epoch()
42
+
43
+ # build data
44
+ print(f'[build data for pre-training] ...\n')
45
+ dataset_train = build_dataset_to_pretrain(args.data_path, args.input_size)
46
+ data_loader_train = DataLoader(
47
+ dataset=dataset_train, num_workers=args.dataloader_workers, pin_memory=True,
48
+ batch_sampler=DistInfiniteBatchSampler(
49
+ dataset_len=len(dataset_train), glb_batch_size=args.glb_batch_size,
50
+ shuffle=True, filling=True, rank=dist.get_rank(), world_size=dist.get_world_size(),
51
+ ), worker_init_fn=worker_init_fn
52
+ )
53
+ itrt_train, iters_train = iter(data_loader_train), len(data_loader_train)
54
+ print(f'[dataloader] gbs={args.glb_batch_size}, lbs={args.batch_size_per_gpu}, iters_train={iters_train}')
55
+
56
+ # build encoder and decoder
57
+ enc: encoder.SparseEncoder = build_sparse_encoder(args.model, input_size=args.input_size, sbn=args.sbn, drop_path_rate=args.dp, verbose=False)
58
+ dec = LightDecoder(enc.downsample_raito, sbn=args.sbn)
59
+ model_without_ddp = SparK(
60
+ sparse_encoder=enc, dense_decoder=dec, mask_ratio=args.mask,
61
+ densify_norm=args.densify_norm, sbn=args.sbn,
62
+ ).to(args.device)
63
+ print(f'[PT model] model = {model_without_ddp}\n')
64
+
65
+ # the model has been randomly initialized in their construction time
66
+ # now try to load some checkpoint as model weight initialization; this ONLY loads the model weights
67
+ misc.initialize_weight(args.init_weight, model_without_ddp)
68
+
69
+ if dist.initialized():
70
+ model: DistributedDataParallel = DistributedDataParallel(model_without_ddp, device_ids=[dist.get_local_rank()], find_unused_parameters=False, broadcast_buffers=False)
71
+ else:
72
+ model = LocalDDP(model_without_ddp)
73
+
74
+ # build optimizer and lr_scheduler
75
+ param_groups: List[dict] = get_param_groups(model_without_ddp, nowd_keys={'cls_token', 'pos_embed', 'mask_token', 'gamma'})
76
+ opt_clz = {
77
+ 'sgd': partial(torch.optim.SGD, momentum=0.9, nesterov=True),
78
+ 'adamw': partial(torch.optim.AdamW, betas=(0.9, args.ada)),
79
+ 'lamb': partial(lamb.TheSameAsTimmLAMB, betas=(0.9, args.ada), max_grad_norm=5.0),
80
+ }[args.opt]
81
+ optimizer = opt_clz(params=param_groups, lr=args.lr, weight_decay=0.0)
82
+ print(f'[optimizer] optimizer({opt_clz}) ={optimizer}\n')
83
+
84
+ # try to resume the experiment from some checkpoint.pth; this will load model weights, optimizer states, and last epoch (ep_start)
85
+ # if loaded, ep_start will be greater than 0
86
+ ep_start, performance_desc = misc.load_checkpoint(args.resume_from, model_without_ddp, optimizer)
87
+ if ep_start >= args.ep: # load from a complete checkpoint file
88
+ print(f' [*] [PT already done] Min/Last Recon Loss: {performance_desc}')
89
+ else: # perform pre-training
90
+ tb_lg = misc.TensorboardLogger(args.tb_lg_dir, is_master=dist.is_master(), prefix='pt')
91
+ min_loss = 1e9
92
+ print(f'[PT start] from ep{ep_start}')
93
+
94
+ pt_start_time = time.time()
95
+ for ep in range(ep_start, args.ep):
96
+ ep_start_time = time.time()
97
+ tb_lg.set_step(ep * iters_train)
98
+ if hasattr(itrt_train, 'set_epoch'):
99
+ itrt_train.set_epoch(ep)
100
+
101
+ stats = pre_train_one_ep(ep, args, tb_lg, itrt_train, iters_train, model, optimizer)
102
+ last_loss = stats['last_loss']
103
+ min_loss = min(min_loss, last_loss)
104
+ performance_desc = f'{min_loss:.4f} {last_loss:.4f}'
105
+ misc.save_checkpoint_with_meta_info_and_opt_state(f'{args.model}_withdecoder_1kpretrained_spark_style.pth', args, ep, performance_desc, model_without_ddp.state_dict(with_config=True), optimizer.state_dict())
106
+ misc.save_checkpoint_model_weights_only(f'{args.model}_1kpretrained_timm_style.pth', args, model_without_ddp.sparse_encoder.sp_cnn.state_dict())
107
+
108
+ ep_cost = round(time.time() - ep_start_time, 2) + 1 # +1s: approximate the following logging cost
109
+ remain_secs = (args.ep-1 - ep) * ep_cost
110
+ remain_time = datetime.timedelta(seconds=round(remain_secs))
111
+ finish_time = time.strftime("%m-%d %H:%M", time.localtime(time.time() + remain_secs))
112
+ print(f' [*] [ep{ep}/{args.ep}] Min/Last Recon Loss: {performance_desc}, Cost: {ep_cost}s, Remain: {remain_time}, Finish @ {finish_time}')
113
+
114
+ args.cur_ep = f'{ep + 1}/{args.ep}'
115
+ args.remain_time, args.finish_time = str(remain_time), str(finish_time)
116
+ args.last_loss = last_loss
117
+ args.log_epoch()
118
+
119
+ tb_lg.update(min_loss=min_loss, head='train', step=ep)
120
+ tb_lg.update(rest_hours=round(remain_secs/60/60, 2), head='z_burnout', step=ep)
121
+ tb_lg.flush()
122
+
123
+ # finish pre-training
124
+ tb_lg.update(min_loss=min_loss, head='result', step=ep_start)
125
+ tb_lg.update(min_loss=min_loss, head='result', step=args.ep)
126
+ tb_lg.flush()
127
+ print(f'final args:\n{str(args)}')
128
+ print('\n\n')
129
+ print(f' [*] [PT finished] Min/Last Recon Loss: {performance_desc}, Total Cost: {(time.time() - pt_start_time) / 60 / 60:.1f}h\n')
130
+ print('\n\n')
131
+ tb_lg.close()
132
+ time.sleep(10)
133
+
134
+ args.remain_time, args.finish_time = '-', time.strftime("%m-%d %H:%M", time.localtime(time.time()))
135
+ args.log_epoch()
136
+
137
+
138
+ def pre_train_one_ep(ep, args: arg_util.Args, tb_lg: misc.TensorboardLogger, itrt_train, iters_train, model: DistributedDataParallel, optimizer):
139
+ model.train()
140
+ me = misc.MetricLogger(delimiter=' ')
141
+ me.add_meter('max_lr', misc.SmoothedValue(window_size=1, fmt='{value:.5f}'))
142
+ header = f'[PT] Epoch {ep}:'
143
+
144
+ optimizer.zero_grad()
145
+ early_clipping = args.clip > 0 and not hasattr(optimizer, 'global_grad_norm')
146
+ late_clipping = hasattr(optimizer, 'global_grad_norm')
147
+ if early_clipping:
148
+ params_req_grad = [p for p in model.parameters() if p.requires_grad]
149
+
150
+ for it, inp in enumerate(me.log_every(iters_train, itrt_train, 3, header)):
151
+ # adjust lr and wd
152
+ min_lr, max_lr, min_wd, max_wd = lr_wd_annealing(optimizer, args.lr, args.wd, args.wde, it + ep * iters_train, args.wp_ep * iters_train, args.ep * iters_train)
153
+
154
+ # forward and backward
155
+ inp = inp.to(args.device, non_blocking=True)
156
+ SparK.forward  # no-op attribute reference; the actual forward pass is invoked via model(inp, ...) on the next line
157
+ loss = model(inp, active_b1ff=None, vis=False)
158
+ optimizer.zero_grad()
159
+ loss.backward()
160
+ loss = loss.item()
161
+ if not math.isfinite(loss):
162
+ print(f'[rk{dist.get_rank():02d}] Loss is {loss}, stopping training!', force=True, flush=True)
163
+ sys.exit(-1)
164
+
165
+ # optimize
166
+ grad_norm = None
167
+ if early_clipping: grad_norm = torch.nn.utils.clip_grad_norm_(params_req_grad, args.clip).item()
168
+ optimizer.step()
169
+ if late_clipping: grad_norm = optimizer.global_grad_norm
170
+ torch.cuda.synchronize()
171
+
172
+ # log
173
+ me.update(last_loss=loss)
174
+ me.update(max_lr=max_lr)
175
+ tb_lg.update(loss=me.meters['last_loss'].global_avg, head='train_loss')
176
+ tb_lg.update(sche_lr=max_lr, head='train_hp/lr_max')
177
+ tb_lg.update(sche_lr=min_lr, head='train_hp/lr_min')
178
+ tb_lg.update(sche_wd=max_wd, head='train_hp/wd_max')
179
+ tb_lg.update(sche_wd=min_wd, head='train_hp/wd_min')
180
+
181
+ if grad_norm is not None:
182
+ me.update(orig_norm=grad_norm)
183
+ tb_lg.update(orig_norm=grad_norm, head='train_hp')
184
+ tb_lg.set_step()
185
+
186
+ me.synchronize_between_processes()
187
+ return {k: meter.global_avg for k, meter in me.meters.items()}
188
+
189
+
190
+ if __name__ == '__main__':
191
+ main_pt()
spark/pretrain/models/__init__.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ from timm import create_model
9
+ from timm.loss import SoftTargetCrossEntropy
10
+ from timm.models.layers import drop
11
+
12
+
13
+ from models.convnext import ConvNeXt
14
+ from models.resnet import ResNet
15
+ from models.custom import YourConvNet
16
+ _import_resnets_for_timm_registration = (ResNet,)
17
+
18
+
19
+ # log more
20
+ def _ex_repr(self):
21
+ return ', '.join(
22
+ f'{k}=' + (f'{v:g}' if isinstance(v, float) else str(v))
23
+ for k, v in vars(self).items()
24
+ if not k.startswith('_') and k != 'training'
25
+ and not isinstance(v, (torch.nn.Module, torch.Tensor))
26
+ )
27
+ for clz in (torch.nn.CrossEntropyLoss, SoftTargetCrossEntropy, drop.DropPath):
28
+ if hasattr(clz, 'extra_repr'):
29
+ clz.extra_repr = _ex_repr
30
+ else:
31
+ clz.__repr__ = lambda self: f'{type(self).__name__}({_ex_repr(self)})'
32
+
33
+
34
+ pretrain_default_model_kwargs = {
35
+ 'V9back': dict(),
36
+
37
+ 'resnet50': dict(drop_path_rate=0.05),
38
+ 'resnet101': dict(drop_path_rate=0.08),
39
+ 'resnet152': dict(drop_path_rate=0.10),
40
+ 'resnet200': dict(drop_path_rate=0.15),
41
+ 'convnext_small': dict(sparse=True, drop_path_rate=0.2),
42
+ 'convnext_base': dict(sparse=True, drop_path_rate=0.3),
43
+ 'convnext_large': dict(sparse=True, drop_path_rate=0.4),
44
+
45
+ }
46
+ for kw in pretrain_default_model_kwargs.values():
47
+ kw['pretrained'] = False
48
+ kw['num_classes'] = 0
49
+ kw['global_pool'] = ''
50
+
51
+
52
+ def build_sparse_encoder(name: str, input_size: int, sbn=False, drop_path_rate=0.0, verbose=False):
53
+ from encoder import SparseEncoder
54
+
55
+ kwargs = pretrain_default_model_kwargs[name]
56
+ if drop_path_rate != 0:
57
+ kwargs['drop_path_rate'] = drop_path_rate
58
+ print(f'[build_sparse_encoder] model kwargs={kwargs}')
59
+ cnn = create_model(name, **kwargs)
60
+
61
+ return SparseEncoder(cnn, input_size=input_size, sbn=sbn, verbose=verbose)
62
+
spark/pretrain/models/convnext.py ADDED
@@ -0,0 +1,125 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # This file is basically a copy of: https://github.com/facebookresearch/ConvNeXt/blob/06f7b05f922e21914916406141f50f82b4a15852/models/convnext.py
8
+ from typing import List
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from timm.models.layers import trunc_normal_
13
+ from timm.models.registry import register_model
14
+
15
+ from encoder import SparseConvNeXtBlock, SparseConvNeXtLayerNorm
16
+
17
+
18
+ class ConvNeXt(nn.Module):
19
+ r""" ConvNeXt
20
+ A PyTorch impl of : `A ConvNet for the 2020s` -
21
+ https://arxiv.org/pdf/2201.03545.pdf
22
+ Args:
23
+ in_chans (int): Number of input image channels. Default: 3
24
+ num_classes (int): Number of classes for classification head. Default: 1000
25
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
26
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
27
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
28
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
29
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
30
+ """
31
+
32
+ def __init__(self, in_chans=3, num_classes=1000,
33
+ depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0.,
34
+ layer_scale_init_value=1e-6, head_init_scale=1., global_pool='avg',
35
+ sparse=True,
36
+ ):
37
+ super().__init__()
38
+ self.dims: List[int] = dims
39
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
40
+ stem = nn.Sequential(
41
+ nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
42
+ SparseConvNeXtLayerNorm(dims[0], eps=1e-6, data_format="channels_first", sparse=sparse)
43
+ )
44
+ self.downsample_layers.append(stem)
45
+ for i in range(3):
46
+ downsample_layer = nn.Sequential(
47
+ SparseConvNeXtLayerNorm(dims[i], eps=1e-6, data_format="channels_first", sparse=sparse),
48
+ nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
49
+ )
50
+ self.downsample_layers.append(downsample_layer)
51
+
52
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
53
+ self.drop_path_rate = drop_path_rate
54
+ self.layer_scale_init_value = layer_scale_init_value
55
+ dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
56
+ cur = 0
57
+ for i in range(4):
58
+ stage = nn.Sequential(
59
+ *[SparseConvNeXtBlock(dim=dims[i], drop_path=dp_rates[cur + j],
60
+ layer_scale_init_value=layer_scale_init_value, sparse=sparse) for j in range(depths[i])]
61
+ )
62
+ self.stages.append(stage)
63
+ cur += depths[i]
64
+ self.depths = depths
65
+
66
+ self.apply(self._init_weights)
67
+ if num_classes > 0:
68
+ self.norm = SparseConvNeXtLayerNorm(dims[-1], eps=1e-6, sparse=False) # final norm layer for LE/FT; should not be sparse
69
+ self.fc = nn.Linear(dims[-1], num_classes)
70
+ else:
71
+ self.norm = nn.Identity()
72
+ self.fc = nn.Identity()
73
+
74
+ def _init_weights(self, m):
75
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
76
+ trunc_normal_(m.weight, std=.02)
77
+ nn.init.constant_(m.bias, 0)
78
+
79
+ def get_downsample_ratio(self) -> int:
80
+ return 32
81
+
82
+ def get_feature_map_channels(self) -> List[int]:
83
+ return self.dims
84
+
85
+ def forward(self, x, hierarchical=False):
86
+ if hierarchical:
87
+ ls = []
88
+ for i in range(4):
89
+ x = self.downsample_layers[i](x)
90
+ x = self.stages[i](x)
91
+ ls.append(x)
92
+ return ls
93
+ else:
94
+ return self.fc(self.norm(x.mean([-2, -1]))) # (B, C, H, W) =mean=> (B, C) =norm&fc=> (B, NumCls)
95
+
96
+ def get_classifier(self):
97
+ return self.fc
98
+
99
+ def extra_repr(self):
100
+ return f'drop_path_rate={self.drop_path_rate}, layer_scale_init_value={self.layer_scale_init_value:g}'
101
+
102
+
103
+ @register_model
104
+ def convnext_tiny(pretrained=False, in_22k=False, **kwargs):
105
+ model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
106
+ return model
107
+
108
+
109
+ @register_model
110
+ def convnext_small(pretrained=False, in_22k=False, **kwargs):
111
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
112
+ return model
113
+
114
+
115
+ @register_model
116
+ def convnext_base(pretrained=False, in_22k=False, **kwargs):
117
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
118
+ return model
119
+
120
+
121
+ @register_model
122
+ def convnext_large(pretrained=False, in_22k=False, **kwargs):
123
+ model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
124
+ return model
125
+
spark/pretrain/models/custom.py ADDED
@@ -0,0 +1,141 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from typing import List
10
+ from timm.models.registry import register_model
11
+ import torch
12
+ from torch import nn
13
+ import sys
14
+ from HG.HGBlock import HGStem,HGBlock
15
+ from HG.block import DWConv
16
+ from v9back.common import *
17
+
18
+
19
+ class YourConvNet(nn.Module):
20
+ def __init__(self, *args, **kwargs):
21
+ super().__init__()
22
+
23
+ self.mlist=nn.ModuleList(
24
+ [Silence(),
25
+ Bbackbone(),
26
+ ]
27
+ )
28
+ self.d0= Down0(64)
29
+ self.d1 = Down1(128)
30
+ self.d2 = Down2(256)
31
+ self.d3 = Down3(512)
32
+ self.d4 = Down4(1024)
33
+ self.alld = [self.d0,self.d1,self.d2,self.d3,self.d4]
34
+ self.cblinear1 = CBLinear(64,[64])
35
+ self.cblinear3 = CBLinear(128, [64, 128])
36
+ self.cblinear5 = CBLinear(256, [64, 128, 256])
37
+ self.cblinear7 = CBLinear(512, [64, 128, 256, 512])
38
+ self.cblinear9 = CBLinear(1024, [64, 128, 256, 512, 1024])
39
+ self.allcblinear = [self.cblinear1,self.cblinear3,self.cblinear5,self.cblinear7,self.cblinear9]
40
+ # # conv down 1
41
+ self.conv1 = Conv(3, 64, 3, 2 )
42
+ self.cbfuse1 = CBFuse([0, 0, 0, 0, 0])
43
+
44
+ ## conv down 2
45
+ self.conv2= Conv(64, 128, 3, 2)
46
+ self.cbfuse2 = CBFuse([1, 1, 1, 1])
47
+ self.rep2 = RepNCSPELAN4(128, 256, 128, 64, 2)
48
+ ## avg-conv down fuse 1
49
+ self.adown3 = ADown(256, 256)
50
+ self.cbfuse3 = CBFuse([2, 2, 2])
51
+ self.rep3 = RepNCSPELAN4(256, 512, 256, 128, 2)
52
+
53
+ ## avg-conv down fuse 2
54
+ self.adown4 = ADown(512, 512)
55
+ self.cbfuse4 = CBFuse([3,3])
56
+ self.rep4 = RepNCSPELAN4(512, 1024, 512, 256, 2)
57
+
58
+ ## avg-conv down fuse 3
59
+ self.adown5 = ADown(1024, 1024)
60
+ self.cbfuse5 = CBFuse([4])
61
+ self.rep5 = RepNCSPELAN4(1024, 1024, 512, 256, 2)
62
+
63
+ def get_downsample_ratio(self) -> int:
64
+ return 32
65
+
66
+ def get_feature_map_channels(self) -> List[int]:
67
+ return [ 256,512,1024,1024]
68
+
69
+ def forward(self, x: torch.Tensor, hierarchical=False):
70
+ if hierarchical:
71
+ origin = x.clone()
72
+ ls = []
73
+ tmp = []
74
+ bx = None
75
+ for index,modules in enumerate( self.mlist):
76
+ x = modules(x)
77
+ if index ==1:
78
+ bx = x
79
+ for i in range(5):
80
+ tmp.append(self.allcblinear[i](self.alld[i](bx)))
81
+
82
+ fuse1 = self.cbfuse1([tmp[0],tmp[1],tmp[2],tmp[3],tmp[4],self.conv1(origin)])
83
+ fuse2 = self.cbfuse2([tmp[1],tmp[2],tmp[3],tmp[4],self.conv2(fuse1)])
84
+ fuse2 = self.rep2(fuse2)
85
+
86
+ fuse3= self.cbfuse3([ tmp[2], tmp[3], tmp[4], self.adown3(fuse2)])
87
+ fuse3 = self.rep3(fuse3)
88
+
89
+ fuse4 = self.cbfuse4([tmp[3], tmp[4], self.adown4(fuse3)])
90
+ fuse4 = self.rep4(fuse4)
91
+
92
+ fuse5 = self.cbfuse5([tmp[4], self.adown5(fuse4)])
93
+ fuse5 = self.rep5(fuse5)
94
+
95
+ ls.append(fuse2)
96
+ ls.append(fuse3)
97
+ ls.append(fuse4)
98
+ ls.append(fuse5)
99
+ return ls
100
+ else:
101
+ for modules in self.mlist:
102
+ x = modules(x)
103
+ return x
104
+
105
+
106
+ @register_model
107
+ def V9back(pretrained=False, **kwargs):
108
+ return YourConvNet(**kwargs)
109
+
110
+
111
+ @torch.no_grad()
112
+ def convnet_test():
113
+ from timm.models import create_model
114
+ cnn = create_model('V9back')
115
+ print('get_downsample_ratio:', cnn.get_downsample_ratio())
116
+ print('get_feature_map_channels:', cnn.get_feature_map_channels())
117
+
118
+ downsample_ratio = cnn.get_downsample_ratio()
119
+ feature_map_channels = cnn.get_feature_map_channels()
120
+
121
+ # check the forward function
122
+ B, C, H, W = 4, 3, 224, 224
123
+ inp = torch.rand(B, C, H, W)
124
+ feats = cnn(inp, hierarchical=True)
125
+ assert isinstance(feats, list)
126
+ assert len(feats) == len(feature_map_channels)
127
+ print([tuple(t.shape) for t in feats])
128
+
129
+ # check the downsample ratio
130
+ feats = cnn(inp, hierarchical=True)
131
+ assert feats[-1].shape[-2] == H // downsample_ratio
132
+ assert feats[-1].shape[-1] == W // downsample_ratio
133
+
134
+ # check the channel number
135
+ for feat, ch in zip(feats, feature_map_channels):
136
+ assert feat.ndim == 4
137
+ assert feat.shape[1] == ch
138
+
139
+
140
+ if __name__ == '__main__':
141
+ convnet_test()
spark/pretrain/models/custom_detr.py ADDED
@@ -0,0 +1,102 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from typing import List
10
+ from timm.models.registry import register_model
11
+ import torch
12
+ from torch import nn
13
+ import sys
14
+ from HG.HGBlock import HGStem,HGBlock
15
+ from HG.block import DWConv
16
+
17
+
18
+ class YourConvNet(nn.Module):
19
+ def __init__(self, *args, **kwargs):
20
+ super().__init__()
21
+
22
+ self.mlist=nn.ModuleList(
23
+ [HGStem(3, 32, 64),
24
+ HGBlock(64, 64, 128, 3, n=6),
25
+
26
+ DWConv(128, 128, 3, 2, 1, False),
27
+ HGBlock(128, 128, 512, 3, n=6),
28
+ HGBlock(512, 128, 512, 3, lightconv=False,shortcut=True,n=6),
29
+
30
+
31
+ DWConv(512, 512, 3, 2, 1, False),
32
+ HGBlock(512, 256, 1024, 5,lightconv=True,shortcut=False,n=6),
33
+ HGBlock(1024, 256, 1024, 5, lightconv=True, shortcut=True, n=6),
34
+ HGBlock(1024, 256, 1024, 5, lightconv=True, shortcut=True, n=6),
35
+ HGBlock(1024, 256, 1024, 5, lightconv=True, shortcut=True, n=6),
36
+ HGBlock(1024, 256, 1024, 5, lightconv=True, shortcut=True, n=6),
37
+
38
+
39
+
40
+ DWConv(1024, 1024, 3, 2, 1, False),
41
+ HGBlock(1024, 512, 2048, 5, lightconv=True, shortcut=False, n=6),
42
+ HGBlock(2048, 512, 2048, 5, lightconv=True, shortcut=True, n=6)
43
+ ]
44
+ )
45
+
46
+
47
+ def get_downsample_ratio(self) -> int:
48
+ return 32
49
+
50
+ def get_feature_map_channels(self) -> List[int]:
51
+ return [128,512,1024,2048]
52
+
53
+ def forward(self, x: torch.Tensor, hierarchical=False):
54
+ if hierarchical:
55
+ ls = []
56
+ for index,modules in enumerate( self.mlist):
57
+ x = modules(x)
58
+ if index in [1,4,10,13]:
59
+ ls.append(x)
60
+ return ls
61
+ else:
62
+ for modules in self.mlist:
63
+ x = modules(x)
64
+ return x
65
+
66
+
67
+ @register_model
68
+ def HGNetv2(pretrained=False, **kwargs):
69
+ return YourConvNet(**kwargs)
70
+
71
+
72
+ @torch.no_grad()
73
+ def convnet_test():
74
+ from timm.models import create_model
75
+ cnn = create_model('HGNetv2')
76
+ print('get_downsample_ratio:', cnn.get_downsample_ratio())
77
+ print('get_feature_map_channels:', cnn.get_feature_map_channels())
78
+
79
+ downsample_ratio = cnn.get_downsample_ratio()
80
+ feature_map_channels = cnn.get_feature_map_channels()
81
+
82
+ # check the forward function
83
+ B, C, H, W = 4, 3, 224, 224
84
+ inp = torch.rand(B, C, H, W)
85
+ feats = cnn(inp, hierarchical=True)
86
+ assert isinstance(feats, list)
87
+ assert len(feats) == len(feature_map_channels)
88
+ print([tuple(t.shape) for t in feats])
89
+
90
+ # check the downsample ratio
91
+ feats = cnn(inp, hierarchical=True)
92
+ assert feats[-1].shape[-2] == H // downsample_ratio
93
+ assert feats[-1].shape[-1] == W // downsample_ratio
94
+
95
+ # check the channel number
96
+ for feat, ch in zip(feats, feature_map_channels):
97
+ assert feat.ndim == 4
98
+ assert feat.shape[1] == ch
99
+
100
+
101
+ if __name__ == '__main__':
102
+ convnet_test()
spark/pretrain/models/custom_origin.py ADDED
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from typing import List
10
+ from timm.models.registry import register_model
11
+
12
+
13
+ class YourConvNet(nn.Module):
14
+ """
15
+ This is a template for your custom ConvNet.
16
+ It is required to implement the following three functions: `get_downsample_ratio`, `get_feature_map_channels`, `forward`.
17
+ You can refer to the implementations in `pretrain/models/resnet.py` for an example.
18
+ """
19
+
20
+ def get_downsample_ratio(self) -> int:
21
+ """
22
+ This func would ONLY be used in `SparseEncoder's __init__` (see `pretrain/encoder.py`).
23
+
24
+ :return: the TOTAL downsample ratio of the ConvNet.
25
+ E.g., for a ResNet-50, this should return 32.
26
+ """
27
+ raise NotImplementedError
28
+
29
+ def get_feature_map_channels(self) -> List[int]:
30
+ """
31
+ This func would ONLY be used in `SparseEncoder's __init__` (see `pretrain/encoder.py`).
32
+
33
+ :return: a list of the number of channels of each feature map.
34
+ E.g., for a ResNet-50, this should return [256, 512, 1024, 2048].
35
+ """
36
+ raise NotImplementedError
37
+
38
+ def forward(self, inp_bchw: torch.Tensor, hierarchical=False):
39
+ """
40
+ The forward with `hierarchical=True` would ONLY be used in `SparseEncoder.forward` (see `pretrain/encoder.py`).
41
+
42
+ :param inp_bchw: input image tensor, shape: (batch_size, channels, height, width).
43
+ :param hierarchical: return the logits (not hierarchical), or the feature maps (hierarchical).
44
+ :return:
45
+ - hierarchical == False: return the logits of the classification task, shape: (batch_size, num_classes).
46
+ - hierarchical == True: return a list of all feature maps, which should have the same length as the return value of `get_feature_map_channels`.
47
+ E.g., for a ResNet-50, it should return a list [1st_feat_map, 2nd_feat_map, 3rd_feat_map, 4th_feat_map].
48
+ for an input size of 224, the shapes are [(B, 256, 56, 56), (B, 512, 28, 28), (B, 1024, 14, 14), (B, 2048, 7, 7)]
49
+ """
50
+ raise NotImplementedError
51
+
52
+
53
+ @register_model
54
+ def your_convnet_small(pretrained=False, **kwargs):
55
+ raise NotImplementedError
56
+ return YourConvNet(**kwargs)
57
+
58
+
59
+ @torch.no_grad()
60
+ def convnet_test():
61
+ from timm.models import create_model
62
+ cnn = create_model('your_convnet_small')
63
+ print('get_downsample_ratio:', cnn.get_downsample_ratio())
64
+ print('get_feature_map_channels:', cnn.get_feature_map_channels())
65
+
66
+ downsample_ratio = cnn.get_downsample_ratio()
67
+ feature_map_channels = cnn.get_feature_map_channels()
68
+
69
+ # check the forward function
70
+ B, C, H, W = 4, 3, 224, 224
71
+ inp = torch.rand(B, C, H, W)
72
+ feats = cnn(inp, hierarchical=True)
73
+ assert isinstance(feats, list)
74
+ assert len(feats) == len(feature_map_channels)
75
+ print([tuple(t.shape) for t in feats])
76
+
77
+ # check the downsample ratio
78
+ feats = cnn(inp, hierarchical=True)
79
+ assert feats[-1].shape[-2] == H // downsample_ratio
80
+ assert feats[-1].shape[-1] == W // downsample_ratio
81
+
82
+ # check the channel number
83
+ for feat, ch in zip(feats, feature_map_channels):
84
+ assert feat.ndim == 4
85
+ assert feat.shape[1] == ch
86
+
87
+
88
+ if __name__ == '__main__':
89
+ convnet_test()