05Nvidia剪枝方案介绍-526互联

Nvidia剪枝方案介绍

目前大多数的剪枝研究存在以下两个方面的问题

绝大多数剪枝是非结构化的，属于细粒度稀疏。而细粒度稀疏其实没有那么好的加速效果
Coarse-grained sparsity的稀疏效果有限

（"Coarse-grained sparsity"是一种稀疏性类型，它指的是在较大的数据块或数据结构中存在稀疏性，而不是在单个元素级别。在深度学习和神经网络中，这通常意味着在层级别或通道级别进行稀疏化，而不是在单个权重或神经元级别。

例如，对于卷积神经网络，粗粒度稀疏性可能意味着整个过滤器或通道被置零或被剪枝，而不是单个权重。这种稀疏性类型的一个优点是，它可以更容易地利用硬件加速器的并行性，因为整个数据块可以一次性地被加载、处理或跳过。

相反，"fine-grained sparsity"则是指在单个元素级别存在稀疏性，例如单个权重或神经元被置零或被剪枝。这种稀疏性类型可能更难以优化，因为它可能需要更复杂的索引和数据管理策略。

"coarse-grained sparsity"是一种在更大的数据结构级别实现稀疏性的策略，它可以更容易地与硬件优化相结合。）

面临的挑战

精度丢失
没有一个通用的剪枝方案去针对不同的网络
Lack of speedup（由于剪完之后结构发生了改变，可能无法使用矩阵加速，可能无法利用内存加速，存储开销变大）

下面是一个demo，流程是加载预训练模型 -> 测试预训练模型 -> 剪枝 ->测试剪枝后的模型 -> 再训练剪枝后的模型 -> 测试再训练后的模型 -> 保存剪枝和再训练后的模型

# End-to-end demo: load -> evaluate -> prune -> evaluate -> retrain -> evaluate -> save.
def _report_stage(banner):
    """Print the stage banner, run the evaluation and dump the third layer's state."""
    print(banner)
    test()
    print(model[2].state_dict())


torch.manual_seed(42)  # seed PyTorch's RNG before anything else runs

# get_model() publishes `model` and `optimizer` as module globals.
# (Calling it with a path that does not exist, e.g. "None", builds and trains a new model.)
get_model("./model.pt")
_report_stage("-------orig---------")

# Attach the mask buffers, patch optimizer.step and zero the pruned weights.
ASP.prune_trained_model(model, optimizer)
_report_stage("-------pruned---------")

# Train again after pruning; optimizer.step is the ASP-patched version from here on.
train()
_report_stage("-------retrain---------")

torch.save(model, "./model_sparse.pt")

构建加载模型的函数get_model()

# Load the model from `f` when that file exists; otherwise build a small model,
# train it once and save it to `f`. A fresh Adam optimizer is created in both cases.
def get_model(f):
    """Populate the module-level ``model`` and ``optimizer`` globals.

    Args:
        f: Path of the checkpoint file. If it exists the whole model object is
            loaded from it; if not, a 3-layer model is built, trained via
            ``train()`` and written to this path.
    """
    global model, optimizer

    checkpoint_found = os.path.exists(f)
    if checkpoint_found:
        # NOTE(review): torch.load unpickles the file, so only load checkpoints
        # you trust. Newer PyTorch releases may also need weights_only=False to
        # load a whole pickled module like this one - confirm against the
        # installed version.
        model = torch.load(f).cuda()
    else:
        layers = [nn.Linear(8, 16), nn.PReLU(), nn.Linear(16, 8)]
        model = nn.Sequential(*layers).cuda()

    optimizer = optim.Adam(model.parameters(), lr=0.01)

    if not checkpoint_found:
        # The optimizer is created above, before train() runs, as in the load path.
        train()
        torch.save(model, f)

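ASP（Automatic SParsity，自动稀疏化）复现。该方法由 Nvidia 在 2020 年提出，并首次随 Nvidia 的 Ampere 架构引入。在这种方法中，哪些权重被保留是由一个称为 "mask" 的 0/1 矩阵决定的。从下面的代码可以看到，mask 并不是在训练过程中学习出来的，而是在模型训练完成之后，根据权重绝对值的大小一次性计算出来的（每 4 个权重为一组，保留绝对值之和最大的 2 个）；随后权重被乘以相应的 mask，这样，不重要的权重（即 mask 中为 0 的位置所对应的权重）就被剪枝掉了。在之后的再训练中，每次 optimizer.step() 之前梯度会乘以 mask、之后权重会再乘以 mask，从而保证被剪掉的权重始终为 0。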

class ASP:
    """Class-level (singleton-style) helper that prunes a trained model with masks.

    All state is stored on the class itself, so only one model/optimizer pair
    can be managed per process. Typical use is a single call to
    ``ASP.prune_trained_model(model, optimizer)``.
    """

    # Model registered by init_model_for_pruning (None until then).
    model = None
    # Messages for individual parameters are printed when verbosity >= 3.
    verbosity = 0
    # Optimizer whose step() has been wrapped (None until then).
    optimizer = None
    # One (module_name, module, param_name, param, mask) tuple per prunable parameter.
    sparse_parameters = []
    # Callable mapping a parameter tensor to its boolean mask.
    calculate_mask = None

    @classmethod
    def init_model_for_pruning(
        cls,
        model,
        mask_calculator="m4n2_1d",
        verbosity=3,
        whitelist=None,
        custom_layer_dict=None,
    ):
        """Register ``model`` and attach an all-ones mask buffer to each prunable weight.

        Args:
            model: The module whose layers are scanned.
            mask_calculator: Either the name of a pattern understood by
                ``create_mask`` (a string) or a callable ``param -> mask``.
            verbosity: Per-parameter messages are printed when this is >= 3.
            whitelist: Module types to consider. ``None`` means
                ``[torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d]``.
                The sequence passed in is never modified.
            custom_layer_dict: Optional ``{module type: [parameter names]}``
                mapping; its keys are added to the whitelist.

        Raises:
            AssertionError: If ASP was already initialized, or if a whitelisted
                module type has no known list of prunable parameters.
        """
        assert cls.model is None, "ASP has been initialized already."

        # Work on a private copy: the `+=` below must not leak into the caller's
        # list (or into a shared default object).
        if whitelist is None:
            whitelist = [torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d]
        else:
            whitelist = list(whitelist)

        # Names of the parameters that may be sparsified, per module type.
        sparse_parameter_list = {
            torch.nn.Linear: ["weight"],
            torch.nn.Conv1d: ["weight"],
            torch.nn.Conv2d: ["weight"],
        }
        if custom_layer_dict:
            # User supplied (layer type : parameter names); the tensors must be
            # something the mask calculator knows how to prune.
            sparse_parameter_list.update(custom_layer_dict)
            whitelist += list(custom_layer_dict.keys())

        # Validate before touching any class state so that a rejected call does
        # not leave ASP half-initialized.
        for module_type in whitelist:
            assert module_type in sparse_parameter_list, (
                "Module %s :: Don't know how to sparsify module."
                % getattr(module_type, "__name__", module_type)
            )

        cls.model = model
        cls.verbosity = verbosity

        if isinstance(mask_calculator, str):
            def create_mask_from_pattern(param):
                return create_mask(param, mask_calculator).bool()

            cls.calculate_mask = create_mask_from_pattern
        else:
            # A user-defined callable is used as-is.
            cls.calculate_mask = mask_calculator

        # Shape check applies to these dtypes only (FP32 is included because the
        # math is done in FP16 internally, per the original note).
        shape_checked_dtypes = (torch.float32, torch.float16)

        def add_sparse_attributes(module_name, module):
            """Register a mask buffer for every prunable parameter of ``module``."""
            prunable_names = sparse_parameter_list[type(module)]
            for p_name, p in module.named_parameters():
                if not (p_name in prunable_names and p.requires_grad):
                    if cls.verbosity >= 3:
                        print(
                            "[ASP] Not sparsifying %s::%s of size=%s and type=%s"
                            % (module_name, p_name, str(p.size()), str(p.dtype))
                        )
                    continue

                # Tensor Core compatibility: dim 0 must be a multiple of 8 and
                # dim 1 a multiple of 16 (for Conv2d the layout is K x CRS and
                # pruning is along C). Incompatible tensors are skipped.
                if p.dtype in shape_checked_dtypes and (
                    (p.size()[0] % 8) != 0 or (p.size()[1] % 16) != 0
                ):
                    print(
                        "[ASP] Auto skipping pruning %s::%s of size=%s and type=%s for sparsity"
                        % (module_name, p_name, str(p.size()), str(p.dtype))
                    )
                    continue

                if cls.verbosity >= 3:
                    print(
                        "[ASP] Sparsifying %s::%s of size=%s and type=%s for sparsity"
                        % (module_name, p_name, str(p.size()), str(p.dtype))
                    )

                # Start with "keep everything"; compute_sparse_masks fills it in later.
                mask = torch.ones_like(p).bool()
                buffname = p_name.split(".")[-1]  # buffer names cannot contain "."
                module.register_buffer("__%s_mma_mask" % buffname, mask)
                cls.sparse_parameters.append((module_name, module, p_name, p, mask))

        for name, sparse_module in eligible_modules(model, tuple(whitelist)):
            add_sparse_attributes(name, sparse_module)

    @classmethod
    def init_optimizer_for_pruning(cls, optimizer):
        """Wrap ``optimizer.step`` so gradients and weights stay masked.

        Args:
            optimizer: The optimizer that updates the registered model.

        Raises:
            AssertionError: If an optimizer was already wrapped, or if
                ``init_model_for_pruning`` has not been called yet.
        """
        assert cls.optimizer is None, "ASP has initialized optimizer already."
        assert (
            cls.calculate_mask is not None
        ), "Called ASP.init_optimizer_for_pruning before ASP.init_model_for_pruning."

        # Keep a handle to the original step method on the optimizer itself.
        # (Inside this class body `__step` is name-mangled to `_ASP__step`.)
        cls.optimizer = optimizer
        cls.optimizer.__step = optimizer.step

        def masked_step(opt_self, *args, **kwargs):
            # Zero the gradients of pruned weights before the update ...
            with torch.no_grad():
                for _, _, _, p, mask in cls.sparse_parameters:
                    if p.grad is not None:  # thx pjudd
                        p.grad.mul_(mask)
            # ... run the original optimizer step ...
            rval = opt_self.__step(*args, **kwargs)
            # ... and zero the pruned weights again afterwards.
            with torch.no_grad():
                for _, _, _, p, mask in cls.sparse_parameters:
                    p.mul_(mask)
            return rval

        cls.optimizer.step = types.MethodType(masked_step, cls.optimizer)

    @classmethod
    def compute_sparse_masks(cls):
        """Compute the mask of every registered parameter and apply it in place."""
        with torch.no_grad():
            for _, _, _, p, mask in cls.sparse_parameters:
                # set_ updates the registered buffer in place; rebinding `mask`
                # would leave the buffer (and the optimizer hook) unchanged.
                mask.set_(cls.calculate_mask(p))
                # In-place multiply: pruned weights become 0, so a checkpoint
                # saved afterwards stores zeros for them.
                p.mul_(mask)

    @classmethod
    def prune_trained_model(cls, model, optimizer):
        """Prune an already-trained model in one call.

        Adds the mask buffers (``init_model_for_pruning``), wraps the optimizer
        (``init_optimizer_for_pruning``) and computes/applies the masks
        (``compute_sparse_masks``).
        """
        cls.init_model_for_pruning(
            model,
            mask_calculator="m4n2_1d",
            verbosity=2,
            whitelist=[torch.nn.Linear, torch.nn.Conv2d],
        )
        cls.init_optimizer_for_pruning(optimizer)
        cls.compute_sparse_masks()

构建mask

def create_mask(tensor, pattern="m4n2_1d", density=0.5):
    """Build a mask for ``tensor`` using the pattern function named ``pattern``.

    The pattern function is looked up by name in this module and is called as
    ``func(matrix_2d, density)``; it must return a mask with the same number of
    elements as the 2-D matrix it receives.

    Tensors are flattened to 2-D so that dimension 1 ends up as the column axis:
        * 1-D ``(C,)``          -> ``(1, C)``
        * 2-D ``(K, C)``        -> unchanged (linear layers)
        * 3-D ``(K, C, R)``     -> ``(K*R, C)`` (1-D convolutions)
        * 4-D ``(K, C, R, S)``  -> ``(R*S*K, C)`` (2-D convolutions)

    Args:
        tensor: The weight tensor to build a mask for (rank 1 to 4).
        pattern: Name of a module-level pattern function.
        density: Passed through to the pattern function unchanged.

    Returns:
        A mask with the same shape and tensor type as ``tensor``.

    Raises:
        ValueError: If ``pattern`` does not name a callable in this module, or
            if ``tensor`` has an unsupported rank.
    """
    func = getattr(sys.modules[__name__], pattern, None)
    if not callable(func):
        raise ValueError("Unknown mask pattern: %r" % (pattern,))

    shape = tensor.shape
    ttype = tensor.type()
    t = tensor.float().contiguous()
    rank = len(shape)

    if rank == 1:
        mask = func(t.view(1, shape[0]), density)
    elif rank == 2:
        mask = func(t.view(shape[0], shape[1]), density)
    elif rank == 3:
        # Move C last, flatten, mask, then undo the permutation.
        flat = t.permute(0, 2, 1).contiguous().view(shape[0] * shape[2], shape[1])
        mask = func(flat, density)
        mask = mask.view(shape[0], shape[2], shape[1]).permute(0, 2, 1).contiguous()
    elif rank == 4:
        # Move (K, C) last, flatten to rows of length C, mask, then undo.
        flat = (
            t.permute(2, 3, 0, 1)
            .contiguous()
            .view(shape[2] * shape[3] * shape[0], shape[1])
        )
        mask = func(flat, density)
        mask = (
            mask.view(shape[2], shape[3], shape[0], shape[1])
            .permute(2, 3, 0, 1)
            .contiguous()
        )
    else:
        raise ValueError(
            "create_mask supports tensors of rank 1 to 4, got rank %d" % rank
        )

    return mask.view(shape).type(ttype)
# Usage example: build the boolean "m4n2_1d" mask for a random 8x16 tensor on cuda:0.
param = torch.randn(8, 16).to("cuda:0")


def create_mask_from_pattern(param):
    """Return the boolean ``m4n2_1d`` mask of ``param`` (small factory-style wrapper)."""
    return create_mask(param, "m4n2_1d").bool()


mask = create_mask_from_pattern(param)

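首先取到权重矩阵，将其按每 m 个元素一组进行分割（这里 m=4），然后把每组权重的绝对值与所有合法的 0/1 排列（m 个位置里选出 n 个为 1）相乘。以 m=4、n=2 为例，共有 C(4,2)=6 种排列，因此相乘的结果是一个"组数×6"的矩阵（例如 8×16 的权重会被分成 32 组，得到 32×6 的矩阵）；最后对每一行取最大值所在的索引（argmax），该索引对应的排列就是这一组的 mask。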

# Enumerate every distinct 0/1 row of length m that contains n ones.
def compute_valid_1d_patterns(m, n):
    """Return a float tensor whose rows are all distinct n-of-m 0/1 patterns.

    Args:
        m: Length of each pattern.
        n: Number of ones in each pattern.

    Returns:
        A tensor with one row per distinct pattern (6 rows for m=4, n=2).
    """
    seed_row = torch.zeros(m)
    seed_row[:n] = 1  # e.g. m=4, n=2 -> [1, 1, 0, 0]
    # permutations() repeats rows whenever equal entries are swapped; the set
    # keeps one copy of each.
    distinct_rows = set(permutations(seed_row.tolist()))
    return torch.Tensor(list(distinct_rows))
def mn_1d_best(matrix, m, n):
    """Pick, for every group of ``m`` values, the n-of-m pattern keeping the most magnitude.

    The work is done on ``matrix``'s own device rather than on a hard-coded
    CUDA device, so CPU tensors and non-default GPUs are handled as well.

    Args:
        matrix: 2-D weight matrix; its element count must be divisible by ``m``.
        m: Group size.
        n: Number of values kept (mask value 1) in each group.

    Returns:
        An int32 mask of the same shape as ``matrix``, on the same device.
    """
    device = matrix.device
    patterns = compute_valid_1d_patterns(m, n).to(device)

    # All-ones mask, viewed as one row per group of m values.
    mask = torch.ones(matrix.shape, dtype=torch.int32, device=device).view(-1, m)

    # reshape_1d regroups the matrix into rows of length m, e.g. [8, 16] -> [32, 4].
    # NOTE(review): assumes reshape_1d keeps the data on matrix's device - confirm.
    mat, _ = reshape_1d(matrix, m)

    # Score every candidate pattern per group: (groups x m) @ (m x patterns),
    # e.g. 32x4 @ 4x6 = 32x6, then take the best-scoring pattern index per row.
    scores = torch.matmul(mat.abs(), patterns.t())
    pmax = torch.argmax(scores, dim=1)

    # Copy the winning 0/1 pattern of each group into the mask.
    mask[:] = patterns[pmax]

    # Restore the original matrix shape.
    return mask.view(matrix.shape)

方案

Nvidia

方案nvidia 05

nvidia communicate nvidia-smi解决方案

management virtual nvidia memory

nvidia-cuda-toolkit

nvidia-driver nvidia driver ubuntu

nvidia-teslam