import copy
import itertools
import operator
from functools import reduce
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch._dynamo.utils import fake_mode_from_tensors
from torch.fx.experimental.optimization import (
    matches_module_pattern,
    replace_node_module,
)
from torch.fx.experimental.symbolic_shapes import guard_int
from torch.fx.passes.shape_prop import ShapeProp
from torch.nn.modules.utils import _pair
from . import config

from .fx_utils import matches_module_function_pattern


class UnaryAttr:
    def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None):
        self.op_name = op_name
        self.scalars_attr = scalars_attr if scalars_attr else []
        self.algorithm_attr = algorithm_attr if algorithm_attr else ""
        super().__init__()

    def __call__(self, unary_module: nn.Module):
        if type(unary_module) is nn.ReLU6:
            unary_module = nn.Hardtanh(min_val=0, max_val=6)
        assert all(hasattr(unary_module, item) for item in self.scalars_attr)
        scalars = [getattr(unary_module, item) for item in self.scalars_attr]

        algorithm = ""
        if self.algorithm_attr:
            assert hasattr(unary_module, self.algorithm_attr)
            algorithm = getattr(unary_module, self.algorithm_attr)

        return self.op_name, scalars, algorithm


def is_bfloat16_module(m):
    weight_is_bf16 = m.weight.dtype == torch.bfloat16
    bias_is_bf16 = m.bias is None or m.bias.dtype == torch.bfloat16
    return weight_is_bf16 and bias_is_bf16


def is_group_depthwise_conv_transpose(m):
    return (
        type(m) in [nn.ConvTranspose2d] and m.groups > 1 and m.groups == m.in_channels
    )


def check_node_kind(current_node, modules, node_kind):
    if not isinstance(current_node, torch.fx.Node):
        return False
    if current_node.op != "call_module":
        return False
    if not isinstance(current_node.target, str):
        return False
    if current_node.target not in modules:
        return False
    if type(modules[current_node.target]) is not node_kind:
        return False
    return True


def check_node_is_binary(node):
    return (
        (node.op == "call_function" and node.target in [torch.add, torch.sub])
        or (
            node.op == "call_function"
            and node.target
            in [operator.add, operator.iadd, operator.sub, operator.isub]
        )
        or (node.op == "call_method" and node.target in ["add", "add_", "sub", "sub_"])
    )


def check_binary_op_kwargs_is_default(node):
    # For binary op, we hope the kwargs values are the default value:
    # torch.sub(add)(input, other, *, alpha=1, out=None).
    if len(node.args) > 2:
        return False
    if len(node.kwargs) > 0:
        if "out" in node.kwargs and node.kwargs["out"] is not None:
            return False
        if "alpha" in node.kwargs and node.kwargs["alpha"] != 1.0:
            return False
    return True


class ConvUnary2d(nn.Conv2d):
    def __init__(
        self,
        conv: nn.Module,
        unary: Optional[nn.Module],
        input_size: list,
    ):
        super().__init__(
            conv.in_channels,
            conv.out_channels,
            conv.kernel_size,
            conv.stride,
            conv.padding,
            conv.dilation,
            conv.groups,
            conv.bias is not None,
            conv.padding_mode,
            conv.weight.device,
            conv.weight.dtype,
        )
        self._update_module_params(conv, unary, input_size)

    def _update_module_params(self, conv, unary, input_size):
        self.__dict__ = copy.deepcopy(conv.__dict__)
        self.attr = "none"
        self.scalars = []
        self.algorithm = ""
        if unary is not None:
            self.attr, self.scalars, self.algorithm = unary_modules_map[
                unary.__class__
            ](unary)
        self.weight = torch.nn.Parameter(
            torch._C._nn.mkldnn_reorder_conv2d_weight(
                self.weight.to_mkldnn(),
                self.padding,
                self.stride,
                self.dilation,
                self.groups,
                tuple(guard_int(x) for x in input_size),
            ),
            requires_grad=self.weight.requires_grad,
        )

    def _conv_forward(self, input, weight, bias):
        if self.padding_mode != "zeros":
            return torch.ops.mkldnn._convolution_pointwise(
                F.pad(
                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
                ),
                weight,
                bias,
                _pair(0),
                self.stride,
                self.dilation,
                self.groups,
                self.attr,
                self.scalars,
                self.algorithm,
            )
        return torch.ops.mkldnn._convolution_pointwise(
            input,
            weight,
            bias,
            self.padding,
            self.stride,
            self.dilation,
            self.groups,
            self.attr,
            self.scalars,
            self.algorithm,
        )

    def forward(self, input):
        return self._conv_forward(input, self.weight, self.bias)


class ConvBinary2d(nn.Conv2d):
    def __init__(
        self,
        conv: nn.Module,
        binary_op_name: str,
        input_size: list,
    ):
        super().__init__(
            conv.in_channels,
            conv.out_channels,
            conv.kernel_size,
            conv.stride,
            conv.padding,
            conv.dilation,
            conv.groups,
            conv.bias is not None,
            conv.padding_mode,
            conv.weight.device,
            conv.weight.dtype,
        )
        self._update_module_params(conv, binary_op_name, input_size)

    def _update_module_params(self, conv, binary_op_name, input_size):
        self.__dict__ = copy.deepcopy(conv.__dict__)
        self.binary_attr = binary_op_name
        self.binary_alpha = None
        self.unary_attr = None
        self.unary_scalars = []
        self.unary_algorithm = None
        self.weight = torch.nn.Parameter(
            torch._C._nn.mkldnn_reorder_conv2d_weight(
                self.weight.to_mkldnn(),
                self.padding,
                self.stride,
                self.dilation,
                self.groups,
                tuple(guard_int(x) for x in input_size),
            ),
            requires_grad=self.weight.requires_grad,
        )

    def _update_unary_params(self, unary):
        self.unary_attr, self.unary_scalars, self.unary_algorithm = unary_modules_map[
            unary.__class__
        ](unary)

    def _conv_forward(self, input, other, weight, bias):
        if self.padding_mode != "zeros":
            return torch.ops.mkldnn._convolution_pointwise(
                F.pad(
                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
                ),
                other,
                weight,
                bias,
                _pair(0),
                self.stride,
                self.dilation,
                self.groups,
                self.binary_attr,
                self.binary_alpha,
                self.unary_attr,
                self.unary_scalars,
                self.unary_algorithm,
            )
        return torch.ops.mkldnn._convolution_pointwise(
            input,
            other,
            weight,
            bias,
            self.padding,
            self.stride,
            self.dilation,
            self.groups,
            self.binary_attr,
            self.binary_alpha,
            self.unary_attr,
            self.unary_scalars,
            self.unary_algorithm,
        )

    def forward(self, input, other):
        return self._conv_forward(input, other, self.weight, self.bias)


class PackedLinear(nn.Linear):
    def __init__(self, linear: nn.Module, input_size: list):
        super().__init__(
            linear.in_features,
            linear.out_features,
            linear.bias is not None,
            linear.weight.device,
            linear.weight.dtype,
        )
        self._update_module_params(linear, input_size)

    def _update_module_params(self, linear, input_size):
        self.__dict__ = copy.deepcopy(linear.__dict__)
        self.batch_size = reduce(lambda x, y: x * y, input_size[:-1])
        self.packed_weight = torch.nn.Parameter(
            torch.ops.mkl._mkl_reorder_linear_weight(
                self.weight.to_mkldnn(), self.batch_size
            ),
            requires_grad=self.weight.requires_grad,
        )

    def forward(self, input):
        y = torch.ops.mkl._mkl_linear(
            input, self.packed_weight, self.weight, self.bias, self.batch_size
        )
        return y


class LinearUnary(nn.Linear):
    def __init__(
        self,
        linear: nn.Module,
        unary: nn.Module,
    ):
        super().__init__(
            linear.in_features,
            linear.out_features,
            linear.bias is not None,
            linear.weight.device,
            linear.weight.dtype,
        )
        self._update_module_params(linear, unary)

    def _update_module_params(self, linear, unary):
        self.__dict__ = copy.deepcopy(linear.__dict__)
        self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
            unary
        )

    def forward(self, input):
        y = torch.ops.mkldnn._linear_pointwise(
            input, self.weight, self.bias, self.attr, self.scalars, self.algorithm
        )
        return y


class LinearBinary(nn.Linear):
    def __init__(self, linear: nn.Module, binary_op_name: str):
        super().__init__(
            linear.in_features,
            linear.out_features,
            linear.bias is not None,
            linear.weight.device,
            linear.weight.dtype,
        )
        self._update_module_params(linear, binary_op_name)

    def _update_module_params(self, linear, binary_op_name):
        self.__dict__ = copy.deepcopy(linear.__dict__)

        self.attr = binary_op_name

    def forward(self, input, other):
        y = torch.ops.mkldnn._linear_pointwise(
            input, other, self.weight, self.bias, self.attr
        )
        return y


class ConvTransposeUnary2d(nn.ConvTranspose2d):
    def __init__(
        self,
        conv_transpose: nn.Module,
        unary: Optional[nn.Module],
        input_size: list,
    ):
        super().__init__(
            conv_transpose.in_channels,
            conv_transpose.out_channels,
            conv_transpose.kernel_size,
            conv_transpose.stride,
            conv_transpose.padding,
            conv_transpose.output_padding,
            conv_transpose.groups,
            conv_transpose.bias is not None,
            conv_transpose.dilation,
            conv_transpose.padding_mode,
            conv_transpose.weight.device,
            conv_transpose.weight.dtype,
        )
        self._update_module_params(conv_transpose, unary, input_size)

    def _update_module_params(self, conv_transpose, unary, input_size):
        self.__dict__ = copy.deepcopy(conv_transpose.__dict__)
        self.attr, self.scalars, self.algorithm = (
            unary_modules_map[unary.__class__](unary) if unary else ("none", [], "")
        )
        packed_weight = torch.ops.mkldnn._reorder_convolution_transpose_weight(
            self.weight.to_mkldnn(),
            self.padding,
            self.output_padding,
            self.stride,
            self.dilation,
            self.groups,
            input_size,
        )
        self.weight = torch.nn.Parameter(
            packed_weight,
            requires_grad=self.weight.requires_grad,
        )

    def _conv_transpose_forward(self, input, weight, bias):
        if self.padding_mode != "zeros":
            return torch.ops.mkldnn._convolution_transpose_pointwise(
                F.pad(
                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
                ),
                weight,
                bias,
                _pair(0),
                self.output_padding,
                self.stride,
                self.dilation,
                self.groups,
                self.attr,
                self.scalars,
                self.algorithm,
            )
        return torch.ops.mkldnn._convolution_transpose_pointwise(
            input,
            weight,
            bias,
            self.padding,
            self.output_padding,
            self.stride,
            self.dilation,
            self.groups,
            self.attr,
            self.scalars,
            self.algorithm,
        )

    def forward(self, input):
        return self._conv_transpose_forward(input, self.weight, self.bias)


def packed_conv_eval(conv: nn.Module, input_size: list):
    assert not (conv.training), "Fusion only for eval!"
    return ConvUnary2d(
        conv,
        None,
        input_size,
    )


def packed_conv_transpose_eval(conv_transpose: nn.Module, input_size: list):
    assert not (conv_transpose.training), "Fusion only for eval!"
    return ConvTransposeUnary2d(
        conv_transpose,
        None,
        input_size,
    )


def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module, input_size: list):
    assert not (conv.training), "Fusion only for eval!"
    return ConvUnary2d(
        conv,
        unary,
        input_size,
    )


def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str, input_size: list):
    assert not (conv.training), "Fusion only for eval!"
    return ConvBinary2d(
        conv,
        binary_op_name,
        input_size,
    )


def fused_conv_binary_unary_eval(
    conv_binary: nn.Module, unary: nn.Module, input_size: list
):
    assert not (conv_binary.training), "Fusion only for eval!"
    # reuse origin conv module, and just update its' unary attr.
    conv_binary._update_unary_params(unary)
    return conv_binary


def packed_linear_eval(linear: nn.Module, input_size: list):
    assert not (linear.training), "Fusion only for eval!"
    return PackedLinear(linear, input_size)


def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module, input_size: list):
    assert not (linear.training), "Fusion only for eval!"
    return LinearUnary(
        linear,
        unary,
    )


def fused_linear_binary_eval(linear: nn.Module, attr: str, input_size: list):
    assert not (linear.training), "Fusion only for eval!"
    linear_binary = LinearBinary(
        linear,
        attr,
    )
    return linear_binary


def fused_conv_transpose_unary_eval(
    conv_transpose: nn.Module, unary: nn.Module, input_size: list
):
    assert not (conv_transpose.training), "Fusion only for eval!"
    return ConvTransposeUnary2d(
        conv_transpose,
        unary,
        input_size,
    )


def mkldnn_fuse_fx(gm: torch.fx.GraphModule, example_inputs):
    is_cpu = all(
        example_input.device == torch.device("cpu")
        for example_input in example_inputs
        if isinstance(example_input, torch.Tensor)
    )

    # make sure the autograd is disabled.
    if torch.is_grad_enabled():
        return gm
    if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()):
        return gm
    if not is_cpu:
        return gm
    # For binary fusion, we need to check inputs info to make sure
    # the binary inputs have same tensor info(device, dtype, and layout).

    fake_mode = fake_mode_from_tensors(example_inputs)
    ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
    gm = fuse_unary(gm)
    gm = fuse_binary(gm)
    # why re-run fuse_unary? we want to enable conv+binary+unary fusion,
    # such as conv+add+relu for vision model.
    gm = fuse_unary(gm)
    if config.cpp.weight_prepack:
        gm = pack_module(gm)
    return gm


def create_unary_module(node: torch.fx.node):
    assert (
        node.op == "call_function" or node.op == "call_method"
    ), "The current node should be a function/method node"
    unary_map = {
        F.relu: nn.ReLU,
        F.sigmoid: nn.Sigmoid,
        F.tanh: nn.Tanh,
        F.hardswish: nn.Hardswish,
        F.leaky_relu: nn.LeakyReLU,
        F.hardtanh: nn.Hardtanh,
        F.gelu: nn.GELU,
        F.relu6: nn.ReLU6,
        F.silu: nn.SiLU,
        F.hardsigmoid: nn.Hardsigmoid,
        torch.relu: nn.ReLU,
        torch.sigmoid: nn.Sigmoid,
        torch.tanh: nn.Tanh,
        "relu": nn.ReLU,
        "sigmoid": nn.Sigmoid,
        "tanh": nn.Tanh,
    }
    return unary_map[node.target](*(node.args[1:]), **(node.kwargs))


def fuse_unary(gm: torch.fx.GraphModule):
    modules = dict(gm.named_modules())

    for unary_op, (
        computation_module,
        fuse_func,
    ) in itertools.product(unary_ops, computation_op_unary_op_fusion_map.items()):
        pattern = (computation_module, unary_op)
        for node in gm.graph.nodes:
            if matches_module_pattern(
                pattern, node, modules
            ) or matches_module_function_pattern(pattern, node, modules):
                if (
                    len(node.args[0].users) > 1
                ):  # Output of computation_node is used by other nodes
                    continue
                computation_node = modules[node.args[0].target]
                if node.op == "call_function" or node.op == "call_method":
                    # make sure unary function's inputs only one fx.node(others should be constant value).
                    if any(isinstance(v, torch.fx.Node) for v in node.args[1:]) or any(
                        isinstance(v, torch.fx.Node) for _, v in node.kwargs.items()
                    ):
                        continue
                    unary_node = create_unary_module(node)
                    unary_node.eval()
                else:
                    unary_node = modules[node.target]
                eval_mode = all(not n.training for n in [computation_node, unary_node])
                if not eval_mode:
                    continue
                # TODO: support padding str input("valid", "same").
                if type(computation_node) in [nn.Conv2d] and isinstance(
                    computation_node.padding, str
                ):
                    continue
                # TODO: support more conv+binary+unary fusion.
                if type(computation_node) in [ConvBinary2d] and type(
                    unary_node
                ) not in [nn.ReLU]:
                    continue
                # only fuse for linear when the dtype is bf16
                if type(computation_node) in [nn.Linear] and not is_bfloat16_module(
                    computation_node
                ):
                    continue
                # TODO: remove this when group depthwise ConvTranspose is supported
                if is_group_depthwise_conv_transpose(computation_node):
                    continue
                computation_node_input_size = (
                    node.args[0].args[0].meta.get("tensor_meta").shape
                )
                fused_module = fuse_func(
                    computation_node, unary_node, computation_node_input_size
                )
                replace_node_module(node.args[0], modules, fused_module)

                node.replace_all_uses_with(node.args[0])
                gm.graph.erase_node(node)
    gm.graph.lint()
    gm.recompile()
    return gm


def replace_and_fuse_for_binary(
    computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
):
    computation_node_input_size = (
        node.args[index_node].args[0].meta.get("tensor_meta").shape
    )
    fused_module = fuse_func(computation_node, attr, computation_node_input_size)
    replace_node_module(node.args[index_node], modules, fused_module)
    node.args[index_node].args = node.args[index_node].args + (
        node.args[index_pointwise],
    )
    node.replace_all_uses_with(node.args[index_node])


def binary_inputs_meta_is_same(binary_node):
    tensor0_meta = binary_node.args[0].meta.get("tensor_meta")
    tensor1_meta = binary_node.args[1].meta.get("tensor_meta")
    if not tensor0_meta or not tensor1_meta:
        return False
    if (
        tensor0_meta.shape != tensor1_meta.shape
        or tensor0_meta.stride != tensor1_meta.stride
        or tensor0_meta.dtype != tensor1_meta.dtype
    ):
        return False

    return True


def fuse_binary(gm: torch.fx.GraphModule):
    modules = dict(gm.named_modules())
    for node in gm.graph.nodes:
        if check_node_is_binary(node) and check_binary_op_kwargs_is_default(node):
            for node_kind, fuse_func in computation_op_binary_op_fusion_map.items():
                if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
                    node.args[1], torch.fx.Node
                ):
                    continue
                if not binary_inputs_meta_is_same(node):
                    continue
                attr = binary_attr[node.target]
                index_list = supported_index_list[attr]
                for index_dict in index_list:
                    index_node = index_dict["index_computation"]
                    index_pointwise = index_dict["index_pointwise"]
                    if check_node_kind(node.args[index_node], modules, node_kind):
                        if len(node.args[index_node].users) > 1:
                            continue
                        computation_node = modules[node.args[index_node].target]
                        if computation_node.training:
                            continue

                        # TODO: support padding str input("valid", "same").
                        if type(computation_node) in [nn.Conv2d] and isinstance(
                            computation_node.padding, str
                        ):
                            continue
                        # only fuse for linear when the dtype is bf16
                        if type(computation_node) in [
                            nn.Linear
                        ] and not is_bfloat16_module(computation_node):
                            continue
                        replace_and_fuse_for_binary(
                            computation_node,
                            node,
                            fuse_func,
                            attr if attr != "iadd" else "add",
                            modules,
                            index_node,
                            index_pointwise,
                        )
                        # Make sure the fused node is post node of node's inputs nodes.
                        node.append(node.args[index_node])
                        gm.graph.erase_node(node)
                        break
    gm.graph.lint()
    gm.recompile()
    return gm


def convert_outplace_to_inplace(gm: torch.fx.GraphModule):
    if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()):
        return gm
    # This function is about replace outplace with inplace for better performance(external call),
    # which happen after AOTAutograd.
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.target in [
            torch.ops.mkldnn._convolution_pointwise.binary
        ]:
            # args[0] and args[1] is _convolution_pointwise.binary's input,
            # need to check whether args[1] can be written or not.
            if node.args[1].op in ["placeholder", "output"]:
                continue
            # TODO: node.args[1].users > 1, but node.args[1] never be used after current node.
            if len(node.args[1].users) > 1:
                continue
            if node.args[1] == node.args[0]:
                continue
            binary_attr = node.args[8]
            unary_attr = node.args[10]
            if binary_attr != "add" or unary_attr not in ["", "relu"]:
                continue
            node.target = torch.ops.mkldnn._convolution_pointwise_.binary
    gm.graph.lint()
    gm.recompile()
    return gm


def pack_module(gm: torch.fx.GraphModule):
    modules = dict(gm.named_modules())
    for node in gm.graph.nodes:
        if node.op == "call_module":
            assert isinstance(node.target, str)
            cur_module = modules[node.target]
            if type(cur_module) in computation_op_packed_map:
                if cur_module.training:
                    continue
                computation_node_input_meta = node.args[0].meta.get("tensor_meta")
                if computation_node_input_meta.dtype != torch.float32:
                    continue
                if type(cur_module) in [torch.nn.Linear] and not torch._C.has_mkl:
                    continue
                computation_node_input_size = computation_node_input_meta.shape
                if (
                    type(cur_module) in [torch.nn.Linear]
                    and len(computation_node_input_size) < 2
                ):
                    continue
                if type(cur_module) in [nn.Conv2d] and isinstance(
                    cur_module.padding, str
                ):
                    continue
                # TODO: remove this when group depthwise ConvTranspose is supported
                if is_group_depthwise_conv_transpose(cur_module):
                    continue
                new_module = computation_op_packed_map[type(cur_module)](
                    cur_module, computation_node_input_size
                )
                assert isinstance(new_module, nn.Module)
                replace_node_module(node, modules, new_module)
    gm.graph.lint()
    gm.recompile()
    return gm


computation_op_unary_op_fusion_map = {
    nn.Conv2d: fused_conv_unary_eval,
    nn.Linear: fused_linear_unary_eval,
    ConvBinary2d: fused_conv_binary_unary_eval,
    nn.ConvTranspose2d: fused_conv_transpose_unary_eval,
}


unary_modules_map = {
    nn.ReLU: UnaryAttr("relu"),
    nn.Sigmoid: UnaryAttr("sigmoid"),
    nn.Tanh: UnaryAttr("tanh"),
    nn.Hardswish: UnaryAttr("hardswish"),
    nn.LeakyReLU: UnaryAttr("leaky_relu", scalars_attr=["negative_slope"]),
    nn.Hardtanh: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
    nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"),
    nn.ReLU6: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
    nn.SiLU: UnaryAttr("swish"),
    nn.Hardsigmoid: UnaryAttr("hardsigmoid"),
}

unary_ops = [
    # modules
    nn.ReLU,
    nn.Sigmoid,
    nn.Tanh,
    nn.Hardswish,
    nn.LeakyReLU,
    nn.Hardtanh,
    nn.GELU,
    nn.ReLU6,
    nn.SiLU,
    nn.Hardsigmoid,
    # functional
    F.relu,
    F.sigmoid,
    F.tanh,
    F.hardswish,
    F.leaky_relu,
    F.hardtanh,
    F.gelu,
    F.relu6,
    F.silu,
    F.hardsigmoid,
    torch.relu,
    torch.sigmoid,
    torch.tanh,
    # methods (torch.Tensor.xxx)
    "relu",
    "sigmoid",
    "tanh",
]


binary_attr = {
    torch.add: "add",  # node.op == "call_function"
    "add": "add",  # node.op == "call_method"
    "add_": "iadd",  # node.op == "call_method"
    operator.add: "add",  # node.op == "call_function"
    operator.iadd: "iadd",  # node.op == "call_function"
    torch.sub: "sub",  # node.op == "call_function"
    "sub": "sub",  # node.op == "call_method"
    "sub_": "sub",  # node.op == "call_method"
    operator.sub: "sub",  # node.op == "call_function"
    operator.isub: "sub",  # node.op == "call_function"
}


computation_op_binary_op_fusion_map = {
    nn.Conv2d: fused_conv_binary_eval,
    nn.Linear: fused_linear_binary_eval,
}


computation_op_packed_map = {
    nn.Linear: packed_linear_eval,
    nn.Conv2d: packed_conv_eval,
    nn.ConvTranspose2d: packed_conv_transpose_eval,
}


# For add: we support conv/linear + other and other + conv
# For sub/add_/sub_, we only support conv/linear - other
# or conv/linear +(-)= other
supported_index_list = {
    "add": [
        {"index_computation": 0, "index_pointwise": 1},
        {"index_computation": 1, "index_pointwise": 0},
    ],
    "iadd": [{"index_computation": 0, "index_pointwise": 1}],
    "sub": [{"index_computation": 0, "index_pointwise": 1}],
}