from typing import Union, Tuple
from torch import nn

from .conv_bn_act_block import ConvBNAct
from .repvgg_block import RepVGGBlock
from .se_blocks import SEBlock


def ConvBNReLU(
    in_channels: int,
    out_channels: int,
    kernel_size: Union[int, Tuple[int, int]],
    stride: Union[int, Tuple[int, int]] = 1,
    padding: Union[int, Tuple[int, int]] = 0,
    dilation: Union[int, Tuple[int, int]] = 1,
    groups: int = 1,
    bias: bool = True,
    padding_mode: str = "zeros",
    use_normalization: bool = True,
    eps: float = 1e-5,
    momentum: float = 0.1,
    affine: bool = True,
    track_running_stats: bool = True,
    device=None,
    dtype=None,
    use_activation: bool = True,
    inplace: bool = False,
):
    """
    Factory for a Convolution2d-Batchnorm2d-ReLU layer, implemented on top of ConvBNAct.

    The default layout is Conv-BN-ReLU. Pass `use_normalization=False` to drop the Batchnorm module and
        `use_activation=False` to drop the ReLU activation.

    This function is kept for backward compatibility and will be superseded by ConvBNAct in future releases.
    New code should use ConvBNAct directly.

    For convolution arguments documentation see `nn.Conv2d`.
    For batchnorm arguments documentation see `nn.BatchNorm2d`.
    For relu arguments documentation see `nn.ReLU`.

    :param use_activation: When True `nn.ReLU` is used as the activation type, otherwise no activation is added.
    :param inplace: Forwarded as the `inplace` argument of the activation.
    :return: The configured ConvBNAct module.
    """
    # Arguments describing the convolution itself.
    conv_args = dict(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
        padding_mode=padding_mode,
    )
    # Arguments describing the (optional) normalization layer.
    norm_args = dict(
        use_normalization=use_normalization,
        eps=eps,
        momentum=momentum,
        affine=affine,
        track_running_stats=track_running_stats,
        device=device,
        dtype=dtype,
    )
    # ConvBNAct skips the activation entirely when the type is None.
    relu_type = nn.ReLU if use_activation else None

    return ConvBNAct(
        **conv_args,
        **norm_args,
        activation_type=relu_type,
        activation_kwargs={"inplace": inplace},
    )


# Names exported by `from <this module> import *`: the three imported blocks plus the ConvBNReLU factory above.
__all__ = ["ConvBNAct", "RepVGGBlock", "SEBlock", "ConvBNReLU"]

          
 
            from typing import Union, Tuple, Type

from torch import nn


class ConvBNAct(nn.Module):
    """
    Class for Convolution2d-Batchnorm2d-Activation layer.
        Default behaviour is Conv-BN-Act. To exclude Batchnorm module use
        `use_normalization=False`, to exclude activation use `activation_type=None`.
    For convolution arguments documentation see `nn.Conv2d`.
    For batchnorm arguments documentation see `nn.BatchNorm2d`.

    The layers are stored in `self.seq` (an `nn.Sequential`) under the names "conv", "bn" and "act";
    "bn" and "act" are only present when enabled.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int]],
        activation_type: Union[Type[nn.Module], None],
        stride: Union[int, Tuple[int, int]] = 1,
        dilation: Union[int, Tuple[int, int]] = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = "zeros",
        use_normalization: bool = True,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
        activation_kwargs=None,
    ):
        """
        :param activation_type: Class of the activation module, or None to build the layer without activation.
        :param use_normalization: Whether to add an `nn.BatchNorm2d` after the convolution.
        :param device: Device on which the convolution and batchnorm parameters are created.
        :param dtype: Dtype with which the convolution and batchnorm parameters are created.
        :param activation_kwargs: Keyword arguments used to instantiate `activation_type` (None means no arguments).
        """

        super().__init__()
        if activation_kwargs is None:
            activation_kwargs = {}

        self.seq = nn.Sequential()
        self.seq.add_module(
            "conv",
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
                bias=bias,
                padding_mode=padding_mode,
                # Keep the convolution on the same device/dtype as the batchnorm below;
                # otherwise a non-default dtype produces mismatched layers.
                device=device,
                dtype=dtype,
            ),
        )

        if use_normalization:
            self.seq.add_module(
                "bn",
                nn.BatchNorm2d(
                    out_channels,
                    eps=eps,
                    momentum=momentum,
                    affine=affine,
                    track_running_stats=track_running_stats,
                    device=device,
                    dtype=dtype,
                ),
            )
        if activation_type is not None:
            self.seq.add_module("act", activation_type(**activation_kwargs))

    def forward(self, x):
        """Apply conv, then (if present) batchnorm and activation, to `x`."""
        return self.seq(x)

          
 
            from typing import Type, Union, Mapping, Any

import numpy as np
import torch
from torch import nn


class RepVGGBlock(nn.Module):
    """
    Repvgg block consists of up to three parallel branches:
    3x3: a branch of a 3x3 Convolution + BatchNorm
    1x1: a branch of a 1x1 Convolution + BatchNorm (optionally scaled by `alpha`)
    no_conv_branch: a branch with only BatchNorm which will only be used if
        input channel == output channel, stride == 1 and use_residual_connection is True
    (usually in all but the first block of each stage)

    The branch outputs are summed, passed through the SE module and then through the activation.
    After `fuse_block_residual_branches()` the branches are replaced by a single 3x3 convolution (`rbr_reparam`).
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        activation_type: Type[nn.Module],
        se_type: Type[nn.Module],
        stride=1,
        dilation=1,
        groups=1,
        activation_kwargs: Union[Mapping[str, Any], None] = None,
        se_kwargs: Union[Mapping[str, Any], None] = None,
        build_residual_branches: bool = True,
        use_residual_connection: bool = True,
        use_alpha: bool = False,
    ):
        """

        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param activation_type: Type of the nonlinearity
        :param se_type: Type of the se block (Use nn.Identity to disable SE)
        :param stride: Output stride
        :param dilation: Dilation factor for 3x3 conv
        :param groups: Number of groups used in convolutions
        :param activation_kwargs: Additional arguments for instantiating activation module.
        :param se_kwargs: Additional arguments for instantiating SE module.
        :param build_residual_branches: If True, keep the separate training-time branches. If False, the branches
            are fused into a single 3x3 convolution right after construction (for deployment)
        :param use_residual_connection: Whether to add input x to the output (Enabled in RepVGG, disabled in PP-Yolo)
        :param use_alpha: If True, enables additional learnable weighting parameter for 1x1 branch (PP-Yolo-E Plus)
        """
        super().__init__()

        if activation_kwargs is None:
            activation_kwargs = {}
        if se_kwargs is None:
            se_kwargs = {}

        self.groups = groups
        self.in_channels = in_channels

        self.nonlinearity = activation_type(**activation_kwargs)
        self.se = se_type(**se_kwargs)

        if use_residual_connection and out_channels == in_channels and stride == 1:
            self.no_conv_branch = nn.BatchNorm2d(num_features=in_channels)
        else:
            self.no_conv_branch = None

        self.branch_3x3 = self._conv_bn(
            in_channels=in_channels,
            out_channels=out_channels,
            dilation=dilation,
            kernel_size=3,
            stride=stride,
            # For a 3x3 kernel, padding == dilation keeps the output the same spatial size as the
            # 1x1 and identity branches, so the three outputs can be summed.
            padding=dilation,
            groups=groups,
        )
        self.branch_1x1 = self._conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=0, groups=groups)

        if use_alpha:
            self.alpha = torch.nn.Parameter(torch.tensor([1.0]), requires_grad=True)
        else:
            self.alpha = 1

        if not build_residual_branches:
            # fuse_block_residual_branches() sets self.build_residual_branches = False when done.
            self.fuse_block_residual_branches()
        else:
            self.build_residual_branches = True

    def forward(self, inputs):
        """Run the fused convolution if the block was fused, otherwise sum the separate branches."""
        if not self.build_residual_branches:
            return self.nonlinearity(self.se(self.rbr_reparam(inputs)))

        if self.no_conv_branch is None:
            id_out = 0
        else:
            id_out = self.no_conv_branch(inputs)

        return self.nonlinearity(self.se(self.branch_3x3(inputs) + self.alpha * self.branch_1x1(inputs) + id_out))

    def _get_equivalent_kernel_bias(self):
        """
        Fuses the 3x3, 1x1 and identity branches into a single 3x3 conv layer
        :return: tuple (kernel, bias) of the equivalent 3x3 convolution
        """
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.branch_3x3)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.branch_1x1)
        kernelid, biasid = self._fuse_bn_tensor(self.no_conv_branch)
        fused_kernel = kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid
        fused_bias = bias3x3 + self.alpha * bias1x1 + biasid
        return fused_kernel, fused_bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """
        padding the 1x1 convolution weights with zeros to be able to fuse the 3x3 conv layer with the 1x1
        :param kernel1x1: weights of the 1x1 convolution (or None)
        :return: padded 1x1 weights, or 0 when kernel1x1 is None
        """
        if kernel1x1 is None:
            return 0
        return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """
        Fusing of the batchnorm into the conv layer.
        If the branch is the identity branch (no conv) the kernel will simply be eye.
        :param branch: conv+bn `nn.Sequential`, a bare `nn.BatchNorm2d` (identity branch) or None
        :return: tuple (kernel, bias) with the batchnorm folded in; (0, 0) when branch is None
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.Sequential):
            kernel = branch.conv.weight
            bn = branch.bn
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            bn = branch
            if not hasattr(self, "id_tensor"):
                # Identity expressed as a (grouped) 3x3 kernel: a single 1 at the kernel center that maps
                # every channel to itself. Created with the BN's dtype/device so the fused kernel keeps
                # the dtype of the block.
                input_dim = self.in_channels // self.groups
                id_kernel = torch.zeros(
                    (self.in_channels, input_dim, 3, 3),
                    dtype=bn.weight.dtype,
                    device=bn.weight.device,
                )
                channels = torch.arange(self.in_channels, device=bn.weight.device)
                id_kernel[channels, channels % input_dim, 1, 1] = 1
                self.id_tensor = id_kernel
            kernel = self.id_tensor

        std = (bn.running_var + bn.eps).sqrt()
        scale = bn.weight / std
        return kernel * scale.reshape(-1, 1, 1, 1), bn.bias - bn.running_mean * scale

    def fuse_block_residual_branches(self):
        """
        converts a repvgg block from training model (with branches) to deployment mode (vgg like model)
        Does nothing when the block is already fused. The branch modules and `alpha` are deleted afterwards.
        """
        # The attribute does not exist yet when this is called from __init__; treat that as "not fused".
        if not getattr(self, "build_residual_branches", True):
            return
        kernel, bias = self._get_equivalent_kernel_bias()
        conv3x3 = self.branch_3x3.conv
        self.rbr_reparam = nn.Conv2d(
            in_channels=conv3x3.in_channels,
            out_channels=conv3x3.out_channels,
            kernel_size=conv3x3.kernel_size,
            stride=conv3x3.stride,
            padding=conv3x3.padding,
            dilation=conv3x3.dilation,
            groups=conv3x3.groups,
            bias=True,
        )
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        del self.branch_3x3
        del self.branch_1x1
        if hasattr(self, "no_conv_branch"):
            del self.no_conv_branch
        if hasattr(self, "alpha"):
            del self.alpha
        self.build_residual_branches = False

    @staticmethod
    def _conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1, dilation=1):
        """Build an `nn.Sequential` with a bias-free "conv" (`nn.Conv2d`) followed by "bn" (`nn.BatchNorm2d`)."""
        result = nn.Sequential()
        result.add_module(
            "conv",
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                groups=groups,
                bias=False,
                dilation=dilation,
            ),
        )
        result.add_module("bn", nn.BatchNorm2d(num_features=out_channels))
        return result

          
 
            import torch
from torch import nn, Tensor
import torch.nn.functional as F


class SEBlock(nn.Module):
    """
    Spatial Squeeze and Channel Excitation Block (cSE).

    Figure 1, Variant a from https://arxiv.org/abs/1808.08127v1

    The input is globally average-pooled per channel, passed through a 1x1 conv bottleneck
    (`down` -> ReLU -> `up` -> sigmoid) and the resulting per-channel gate multiplies the input.
    """

    def __init__(self, in_channels: int, internal_neurons: int):
        """
        :param in_channels: Number of channels of the input (and of the output)
        :param internal_neurons: Number of channels of the bottleneck between `down` and `up`
        """
        super().__init__()
        self.down = nn.Conv2d(in_channels=in_channels, out_channels=internal_neurons, kernel_size=1, stride=1, bias=True)
        self.up = nn.Conv2d(in_channels=internal_neurons, out_channels=in_channels, kernel_size=1, stride=1, bias=True)
        self.input_channels = in_channels

    def forward(self, inputs: Tensor) -> Tensor:
        """
        :param inputs: Tensor whose last two dimensions are pooled over (indexed as N, C, H, W by the code)
        :return: `inputs` scaled by the per-channel gate, same shape as `inputs`
        """
        # Global average pooling over the full spatial extent. Using the width as a square pooling
        # kernel only works for H == W; adaptive pooling yields a 1x1 map for any input size.
        x = F.adaptive_avg_pool2d(inputs, 1)
        x = self.down(x)
        x = F.relu(x)
        x = self.up(x)
        x = torch.sigmoid(x)
        return inputs * x

          
@@ -11,212 +11,13 @@ Based on https://github.com/DingXiaoH/RepVGG
 
                             from typing import Union
                
 
                             import torch.nn as nn
                
 
                            -import numpy as np
                
 
                            -import torch
                
 
                            -import torch.nn.parallel
                
 
                            -import torch.optim
                
 
                            -import torch.utils.data
                
 
                            -import torch.utils.data.distributed
                
 
                            +
                
 
                            +from super_gradients.modules import RepVGGBlock, SEBlock
                
 
                             from super_gradients.training.models.sg_module import SgModule
                
 
                            -import torch.nn.functional as F
                
 
                             from super_gradients.training.utils.module_utils import fuse_repvgg_blocks_residual_branches
                
 
                             from super_gradients.training.utils.utils import get_param
                
 
                            -class SEBlock(nn.Module):
                
 
                            -    def __init__(self, input_channels, internal_neurons):
                
 
                            -        super(SEBlock, self).__init__()
                
 
                            -        self.down = nn.Conv2d(
                
 
                            -            in_channels=input_channels, out_channels=internal_neurons, kernel_size=1, stride=1, bias=True
                
 
                            -        )
                
 
                            -        self.up = nn.Conv2d(
                
 
                            -            in_channels=internal_neurons, out_channels=input_channels, kernel_size=1, stride=1, bias=True
                
 
                            -        )
                
 
                            -        self.input_channels = input_channels
                
 
                            -
                
 
                            -    def forward(self, inputs):
                
 
                            -        x = F.avg_pool2d(inputs, kernel_size=inputs.size(3))
                
 
                            -        x = self.down(x)
                
 
                            -        x = F.relu(x)
                
 
                            -        x = self.up(x)
                
 
                            -        x = torch.sigmoid(x)
                
 
                            -        x = x.view(-1, self.input_channels, 1, 1)
                
 
                            -        return inputs * x
                
 
                            -
                
 
                            -
                
 
                            -def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1, dilation=1):
                
 
                            -    result = nn.Sequential()
                
 
                            -    result.add_module(
                
 
                            -        "conv",
                
 
                            -        nn.Conv2d(
                
 
                            -            in_channels=in_channels,
                
 
                            -            out_channels=out_channels,
                
 
                            -            kernel_size=kernel_size,
                
 
                            -            stride=stride,
                
 
                            -            padding=padding,
                
 
                            -            groups=groups,
                
 
                            -            bias=False,
                
 
                            -            dilation=dilation,
                
 
                            -        ),
                
 
                            -    )
                
 
                            -    result.add_module("bn", nn.BatchNorm2d(num_features=out_channels))
                
 
                            -    return result
                
 
                            -
                
 
                            -
                
 
                            -class RepVGGBlock(nn.Module):
                
 
                            -    """
                
 
                            -    Repvgg block consists of three branches
                
 
                            -    3x3: a branch of a 3x3 convolution + batchnorm + relu
                
 
                            -    1x1: a branch of a 1x1 convolution + batchnorm + relu
                
 
                            -    no_conv_branch: a branch with only batchnorm which will only be used if input channel == output channel
                
 
                            -    (usually in all but the first block of each stage)
                
 
                            -    """
                
 
                            -
                
 
                            -    def __init__(
                
 
                            -        self,
                
 
                            -        in_channels,
                
 
                            -        out_channels,
                
 
                            -        kernel_size,
                
 
                            -        stride=1,
                
 
                            -        padding=0,
                
 
                            -        dilation=1,
                
 
                            -        groups=1,
                
 
                            -        build_residual_branches=True,
                
 
                            -        use_relu=True,
                
 
                            -        use_se=False,
                
 
                            -    ):
                
 
                            -
                
 
                            -        super(RepVGGBlock, self).__init__()
                
 
                            -
                
 
                            -        self.groups = groups
                
 
                            -        self.in_channels = in_channels
                
 
                            -
                
 
                            -        assert kernel_size == 3
                
 
                            -        assert padding == dilation
                
 
                            -
                
 
                            -        self.nonlinearity = nn.ReLU() if use_relu else nn.Identity()
                
 
                            -        self.se = nn.Identity() if not use_se else SEBlock(out_channels, internal_neurons=out_channels // 16)
                
 
                            -
                
 
                            -        self.no_conv_branch = (
                
 
                            -            nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None
                
 
                            -        )
                
 
                            -        self.branch_3x3 = conv_bn(
                
 
                            -            in_channels=in_channels,
                
 
                            -            out_channels=out_channels,
                
 
                            -            dilation=dilation,
                
 
                            -            kernel_size=kernel_size,
                
 
                            -            stride=stride,
                
 
                            -            padding=padding,
                
 
                            -            groups=groups,
                
 
                            -        )
                
 
                            -        self.branch_1x1 = conv_bn(
                
 
                            -            in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=0, groups=groups
                
 
                            -        )
                
 
                            -
                
 
                            -        if not build_residual_branches:
                
 
                            -            self.fuse_block_residual_branches()
                
 
                            -        else:
                
 
                            -            self.build_residual_branches = True
                
 
                            -
                
 
                            -    def forward(self, inputs):
                
 
                            -        if not self.build_residual_branches:
                
 
                            -            return self.nonlinearity(self.se(self.rbr_reparam(inputs)))
                
 
                            -
                
 
                            -        if self.no_conv_branch is None:
                
 
                            -            id_out = 0
                
 
                            -        else:
                
 
                            -            id_out = self.no_conv_branch(inputs)
                
 
                            -
                
 
                            -        return self.nonlinearity(self.se(self.branch_3x3(inputs) + self.branch_1x1(inputs) + id_out))
                
 
                            -
                
 
                            -    def _get_equivalent_kernel_bias(self):
                
 
                            -        """
                
 
                            -        Fuses the 3x3, 1x1 and identity branches into a single 3x3 conv layer
                
 
                            -        """
                
 
                            -        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.branch_3x3)
                
 
                            -        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.branch_1x1)
                
 
                            -        kernelid, biasid = self._fuse_bn_tensor(self.no_conv_branch)
                
 
                            -        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
                
 
                            -
                
 
                            -    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
                
 
                            -        """
                
 
                            -        padding the 1x1 convolution weights with zeros to be able to fuse the 3x3 conv layer with the 1x1
                
 
                            -        :param kernel1x1: weights of the 1x1 convolution
                
 
                            -        :type kernel1x1:
                
 
                            -        :return: padded 1x1 weights
                
 
                            -        :rtype:
                
 
                            -        """
                
 
                            -        if kernel1x1 is None:
                
 
                            -            return 0
                
 
                            -        else:
                
 
                            -            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
                
 
                            -
                
 
                            -    def _fuse_bn_tensor(self, branch):
                
 
                            -        """
                
 
                            -        Fusing of the batchnorm into the conv layer.
                
 
                            -        If the branch is the identity branch (no conv) the kernel will simply be eye.
                
 
                            -        :param branch:
                
 
                            -        :type branch:
                
 
                            -        :return:
                
 
                            -        :rtype:
                
 
                            -        """
                
 
                            -        if branch is None:
                
 
                            -            return 0, 0
                
 
                            -        if isinstance(branch, nn.Sequential):
                
 
                            -            kernel = branch.conv.weight
                
 
                            -            running_mean = branch.bn.running_mean
                
 
                            -            running_var = branch.bn.running_var
                
 
                            -            gamma = branch.bn.weight
                
 
                            -            beta = branch.bn.bias
                
 
                            -            eps = branch.bn.eps
                
 
                            -        else:
                
 
                            -            assert isinstance(branch, nn.BatchNorm2d)
                
 
                            -            if not hasattr(self, "id_tensor"):
                
 
                            -                input_dim = self.in_channels // self.groups
                
 
                            -                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32)
                
 
                            -                for i in range(self.in_channels):
                
 
                            -                    kernel_value[i, i % input_dim, 1, 1] = 1
                
 
                            -                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
                
 
                            -            kernel = self.id_tensor
                
 
                            -            running_mean = branch.running_mean
                
 
                            -            running_var = branch.running_var
                
 
                            -            gamma = branch.weight
                
 
                            -            beta = branch.bias
                
 
                            -            eps = branch.eps
                
 
                            -        std = (running_var + eps).sqrt()
                
 
                            -        t = (gamma / std).reshape(-1, 1, 1, 1)
                
 
                            -        return kernel * t, beta - running_mean * gamma / std
                
 
                            -
                
 
                            -    def fuse_block_residual_branches(self):
                
 
                            -        """
                
 
                            -        converts a repvgg block from training model (with branches) to deployment mode (vgg like model)
                
 
                            -        :return:
                
 
                            -        :rtype:
                
 
                            -        """
                
 
                            -        if hasattr(self, "build_residual_branches") and not self.build_residual_branches:
                
 
                            -            return
                
 
                            -        kernel, bias = self._get_equivalent_kernel_bias()
                
 
                            -        self.rbr_reparam = nn.Conv2d(
                
 
                            -            in_channels=self.branch_3x3.conv.in_channels,
                
 
                            -            out_channels=self.branch_3x3.conv.out_channels,
                
 
                            -            kernel_size=self.branch_3x3.conv.kernel_size,
                
 
                            -            stride=self.branch_3x3.conv.stride,
                
 
                            -            padding=self.branch_3x3.conv.padding,
                
 
                            -            dilation=self.branch_3x3.conv.dilation,
                
 
                            -            groups=self.branch_3x3.conv.groups,
                
 
                            -            bias=True,
                
 
                            -        )
                
 
                            -        self.rbr_reparam.weight.data = kernel
                
 
                            -        self.rbr_reparam.bias.data = bias
                
 
                            -        for para in self.parameters():
                
 
                            -            para.detach_()
                
 
                            -        self.__delattr__("branch_3x3")
                
 
                            -        self.__delattr__("branch_1x1")
                
 
                            -        if hasattr(self, "no_conv_branch"):
                
 
                            -            self.__delattr__("no_conv_branch")
                
 
                            -        self.build_residual_branches = False
                
 
                            -
                
 
                            -
                
 
                             class RepVGG(SgModule):
                
 
                                 def __init__(
                
 
                                     self,
                
@@ -253,11 +54,12 @@ class RepVGG(SgModule):
 
                                     self.stem = RepVGGBlock(
                
 
                                         in_channels=in_channels,
                
 
                                         out_channels=self.in_planes,
                
 
                            -            kernel_size=3,
                
 
                                         stride=2,
                
 
                            -            padding=1,
                
 
                                         build_residual_branches=build_residual_branches,
                
 
                            -            use_se=self.use_se,
                
 
                            +            activation_type=nn.ReLU,
                
 
                            +            activation_kwargs=dict(inplace=True),
                
 
                            +            se_type=SEBlock if self.use_se else nn.Identity,
                
 
                            +            se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
                
 
                                     )
                
 
                                     self.cur_layer_idx = 1
                
 
                                     self.stage1 = self._make_stage(int(64 * width_multiplier[0]), struct[0], stride=2)
                
@@ -282,12 +84,13 @@ class RepVGG(SgModule):
 
                                             RepVGGBlock(
                
 
                                                 in_channels=self.in_planes,
                
 
                                                 out_channels=planes,
                
 
                            -                    kernel_size=3,
                
 
                                                 stride=stride,
                
 
                            -                    padding=1,
                
 
                                                 groups=1,
                
 
                                                 build_residual_branches=self.build_residual_branches,
                
 
                            -                    use_se=self.use_se,
                
 
                            +                    activation_type=nn.ReLU,
                
 
                            +                    activation_kwargs=dict(inplace=True),
                
 
                            +                    se_type=SEBlock if self.use_se else nn.Identity,
                
 
                            +                    se_kwargs=dict(in_channels=self.in_planes, internal_neurons=self.in_planes // 16) if self.use_se else None,
                
 
                                             )
                
 
                                         )
                
 
                                         self.in_planes = planes
                
@@ -312,10 +115,9 @@ class RepVGG(SgModule):
 
                                 def train(self, mode: bool = True):
                
 
                            -        assert not mode or self.build_residual_branches, (
                
 
                            -            "Trying to train a model without residual branches, "
                
 
                            -            "set arch_params.build_residual_branches to True and retrain the model"
                
 
                            -        )
                
 
                            +        assert (
                
 
                            +            not mode or self.build_residual_branches
                
 
                            +        ), "Trying to train a model without residual branches, set arch_params.build_residual_branches to True and retrain the model"
                
 
                                     super(RepVGG, self).train(mode=mode)
                
 
                                 def replace_head(self, new_num_classes=None, new_head=None):
                
@@ -3,7 +3,8 @@ import torch.nn as nn
 
                             import torch.nn.functional as F
                
 
                             from typing import Union, List, Tuple
                
 
                            -from super_gradients.training.utils.module_utils import ConvBNReLU, make_upsample_module
                
 
                            +from super_gradients.modules import ConvBNReLU
                
 
                            +from super_gradients.training.utils.module_utils import make_upsample_module
                
 
                             from super_gradients.common import UpsampleMode
                
 
                             from super_gradients.training.models.segmentation_models.stdc import SegmentationHead, AbstractSTDCBackbone,\
                
 
                                 STDC1Backbone, STDC2Backbone
                
@@ -8,7 +8,7 @@ import torch
 
                             import torch.nn as nn
                
 
                             from super_gradients.training.models import SgModule
                
 
                             from super_gradients.training.utils import HpmStruct, get_param
                
 
                            -from super_gradients.training.utils.module_utils import ConvBNReLU
                
 
                            +from super_gradients.modules import ConvBNReLU
                
 
                             DEFAULT_REGSEG48_BACKBONE_PARAMS = {
                
 
                                 "stages": [
                
@@ -11,7 +11,7 @@ from super_gradients.common.decorators.factory_decorator import resolve_param
 
                             from super_gradients.common.factories.base_factory import BaseFactory
                
 
                             from super_gradients.training.models import SgModule
                
 
                             from super_gradients.training.utils import get_param, HpmStruct
                
 
                            -from super_gradients.training.utils.module_utils import ConvBNReLU
                
 
                            +from super_gradients.modules import ConvBNReLU
                
 
                             from typing import Union, List
                
 
                             from abc import ABC, abstractmethod
                
@@ -1,6 +1,6 @@
 
                             from collections import OrderedDict
                
 
                             import copy
                
 
                            -from typing import List, Union, Tuple, Optional
                
 
                            +from typing import List, Union, Optional
                
 
                             import torch
                
 
                             from torch import nn
                
@@ -45,7 +45,7 @@ class MultiOutputModule(nn.Module):
 
                                     """
                
 
                                     super().__init__()
                
 
                                     self.output_paths = output_paths
                
 
                            -        self._modules['0'] = module
                
 
                            +        self._modules["0"] = module
                
 
                                     self._outputs_lists = {}
                
 
                                     for path in output_paths:
                
@@ -61,7 +61,7 @@ class MultiOutputModule(nn.Module):
 
                                 def forward(self, x) -> list:
                
 
                                     self._outputs_lists[x.device] = []
                
 
                            -        self._modules['0'](x)
                
 
                            +        self._modules["0"](x)
                
 
                                     return self._outputs_lists[x.device]
                
 
                                 def _get_recursive(self, module: nn.Module, path) -> nn.Module:
                
@@ -100,10 +100,7 @@ class MultiOutputModule(nn.Module):
 
                                 def _slice_odict(self, odict: OrderedDict, start: int, end: int):
                
 
                                     """Slice an OrderedDict in the same logic list,tuple... are sliced"""
                
 
                            -        return OrderedDict([
                
 
                            -            (k, v) for (k, v) in odict.items()
                
 
                            -            if k in list(odict.keys())[start:end]
                
 
                            -        ])
                
 
                            +        return OrderedDict([(k, v) for (k, v) in odict.items() if k in list(odict.keys())[start:end]])
                
 
                             def _replace_activations_recursive(module: nn.Module, new_activation: nn.Module, activations_to_replace: List[type]):
                
@@ -125,79 +122,28 @@ def replace_activations(module: nn.Module, new_activation: nn.Module, activation
 
                                 :param activations_to_replace:  types of activations to replace, each must be a subclass of nn.Module
                
 
                                 """
                
 
                                 # check arguments once before the recursion
                
 
                            -    assert isinstance(new_activation, nn.Module), 'new_activation should be nn.Module'
                
 
                            -    assert all([isinstance(t, type) and issubclass(t, nn.Module) for t in activations_to_replace]), \
                
 
                            -        'activations_to_replace should be types that are subclasses of nn.Module'
                
 
                            +    assert isinstance(new_activation, nn.Module), "new_activation should be nn.Module"
                
 
                            +    assert all(
                
 
                            +        [isinstance(t, type) and issubclass(t, nn.Module) for t in activations_to_replace]
                
 
                            +    ), "activations_to_replace should be types that are subclasses of nn.Module"
                
 
                                 # do the replacement
                
 
                                 _replace_activations_recursive(module, new_activation, activations_to_replace)
                
 
                             def fuse_repvgg_blocks_residual_branches(model: nn.Module):
                
 
                            -    '''
                
 
                            +    """
                
 
                                 Call fuse_block_residual_branches for all repvgg blocks in the model
                
 
                                 :param model: torch.nn.Module with repvgg blocks. Doesn't have to be entirely consists of repvgg.
                
 
                                 :type model: torch.nn.Module
                
 
                            -    '''
                
 
                            +    """
                
 
                                 assert not model.training, "To fuse RepVGG block residual branches, model must be on eval mode"
                
 
                                 for module in model.modules():
                
 
                            -        if hasattr(module, 'fuse_block_residual_branches'):
                
 
                            +        if hasattr(module, "fuse_block_residual_branches"):
                
 
                                         module.fuse_block_residual_branches()
                
 
                                 model.build_residual_branches = False
                
 
                            -class ConvBNReLU(nn.Module):
                
 
                            -    """
                
 
                            -    Class for Convolution2d-Batchnorm2d-Relu layer. Default behaviour is Conv-BN-Relu. To exclude Batchnorm module use
                
 
                            -        `use_normalization=False`, to exclude Relu activation use `use_activation=False`.
                
 
                            -    For convolution arguments documentation see `nn.Conv2d`.
                
 
                            -    For batchnorm arguments documentation see `nn.BatchNorm2d`.
                
 
                            -    For relu arguments documentation see `nn.Relu`.
                
 
                            -    """
                
 
                            -
                
 
                            -    def __init__(self,
                
 
                            -                 in_channels: int,
                
 
                            -                 out_channels: int,
                
 
                            -                 kernel_size: Union[int, Tuple[int, int]],
                
 
                            -                 stride: Union[int, Tuple[int, int]] = 1,
                
 
                            -                 padding: Union[int, Tuple[int, int]] = 0,
                
 
                            -                 dilation: Union[int, Tuple[int, int]] = 1,
                
 
                            -                 groups: int = 1,
                
 
                            -                 bias: bool = True,
                
 
                            -                 padding_mode: str = 'zeros',
                
 
                            -                 use_normalization: bool = True,
                
 
                            -                 eps: float = 1e-5,
                
 
                            -                 momentum: float = 0.1,
                
 
                            -                 affine: bool = True,
                
 
                            -                 track_running_stats: bool = True,
                
 
                            -                 device=None,
                
 
                            -                 dtype=None,
                
 
                            -                 use_activation: bool = True,
                
 
                            -                 inplace: bool = False):
                
 
                            -
                
 
                            -        super(ConvBNReLU, self).__init__()
                
 
                            -        self.seq = nn.Sequential()
                
 
                            -        self.seq.add_module("conv", nn.Conv2d(in_channels,
                
 
                            -                                              out_channels,
                
 
                            -                                              kernel_size=kernel_size,
                
 
                            -                                              stride=stride,
                
 
                            -                                              padding=padding,
                
 
                            -                                              dilation=dilation,
                
 
                            -                                              groups=groups,
                
 
                            -                                              bias=bias,
                
 
                            -                                              padding_mode=padding_mode))
                
 
                            -
                
 
                            -        if use_normalization:
                
 
                            -            self.seq.add_module("bn", nn.BatchNorm2d(out_channels, eps=eps, momentum=momentum, affine=affine,
                
 
                            -                                                     track_running_stats=track_running_stats, device=device,
                
 
                            -                                                     dtype=dtype))
                
 
                            -        if use_activation:
                
 
                            -            self.seq.add_module("relu", nn.ReLU(inplace=inplace))
                
 
                            -
                
 
                            -    def forward(self, x):
                
 
                            -        return self.seq(x)
                
 
                            -
                
 
                            -
                
 
                             class NormalizationAdapter(torch.nn.Module):
                
 
                                 """
                
 
                                 Denormalizes input by mean_original, std_original, then normalizes by mean_required, std_required.
                
@@ -208,6 +154,7 @@ class NormalizationAdapter(torch.nn.Module):
 
                                  number of input channels.
                
 
                                 """
                
 
                            +
                
 
                                 def __init__(self, mean_original, std_original, mean_required, std_required):
                
 
                                     super(NormalizationAdapter, self).__init__()
                
 
                                     mean_original = torch.tensor(mean_original).unsqueeze(-1).unsqueeze(-1)
                
@@ -223,9 +170,7 @@ class NormalizationAdapter(torch.nn.Module):
 
                                     return x
                
 
                            -def make_upsample_module(scale_factor: int,
                
 
                            -                         upsample_mode: Union[str, UpsampleMode],
                
 
                            -                         align_corners: Optional[bool] = None):
                
 
                            +def make_upsample_module(scale_factor: int, upsample_mode: Union[str, UpsampleMode], align_corners: Optional[bool] = None):
                
 
                                 """
                
 
                                 Factory method for creating upsampling modules.
                
 
                                 :param scale_factor: upsample scale factor
                
@@ -1,7 +1,7 @@
 
                             import torch
                
 
                             import unittest
                
 
                             import torch.nn as nn
                
 
                            -from super_gradients.training.utils.module_utils import ConvBNReLU
                
 
                            +from super_gradients.modules import ConvBNReLU
                
 
                             class TestConvBnRelu(unittest.TestCase):