linear.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. import math
  2. from typing import Any
  3. import torch
  4. from torch import Tensor
  5. from torch.nn.parameter import Parameter, UninitializedParameter
  6. from .. import functional as F
  7. from .. import init
  8. from .module import Module
  9. from .lazy import LazyModuleMixin
  10. __all__ = [
  11. 'Bilinear',
  12. 'Identity',
  13. 'LazyLinear',
  14. 'Linear',
  15. ]
  16. class Identity(Module):
  17. r"""A placeholder identity operator that is argument-insensitive.
  18. Args:
  19. args: any argument (unused)
  20. kwargs: any keyword argument (unused)
  21. Shape:
  22. - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
  23. - Output: :math:`(*)`, same shape as the input.
  24. Examples::
  25. >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
  26. >>> input = torch.randn(128, 20)
  27. >>> output = m(input)
  28. >>> print(output.size())
  29. torch.Size([128, 20])
  30. """
  31. def __init__(self, *args: Any, **kwargs: Any) -> None:
  32. super().__init__()
  33. def forward(self, input: Tensor) -> Tensor:
  34. return input
  35. class Linear(Module):
  36. r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
  37. This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
  38. On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
  39. Args:
  40. in_features: size of each input sample
  41. out_features: size of each output sample
  42. bias: If set to ``False``, the layer will not learn an additive bias.
  43. Default: ``True``
  44. Shape:
  45. - Input: :math:`(*, H_{in})` where :math:`*` means any number of
  46. dimensions including none and :math:`H_{in} = \text{in\_features}`.
  47. - Output: :math:`(*, H_{out})` where all but the last dimension
  48. are the same shape as the input and :math:`H_{out} = \text{out\_features}`.
  49. Attributes:
  50. weight: the learnable weights of the module of shape
  51. :math:`(\text{out\_features}, \text{in\_features})`. The values are
  52. initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
  53. :math:`k = \frac{1}{\text{in\_features}}`
  54. bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
  55. If :attr:`bias` is ``True``, the values are initialized from
  56. :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
  57. :math:`k = \frac{1}{\text{in\_features}}`
  58. Examples::
  59. >>> m = nn.Linear(20, 30)
  60. >>> input = torch.randn(128, 20)
  61. >>> output = m(input)
  62. >>> print(output.size())
  63. torch.Size([128, 30])
  64. """
  65. __constants__ = ['in_features', 'out_features']
  66. in_features: int
  67. out_features: int
  68. weight: Tensor
  69. def __init__(self, in_features: int, out_features: int, bias: bool = True,
  70. device=None, dtype=None) -> None:
  71. factory_kwargs = {'device': device, 'dtype': dtype}
  72. super().__init__()
  73. self.in_features = in_features
  74. self.out_features = out_features
  75. self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
  76. if bias:
  77. self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
  78. else:
  79. self.register_parameter('bias', None)
  80. self.reset_parameters()
  81. def reset_parameters(self) -> None:
  82. # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
  83. # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
  84. # https://github.com/pytorch/pytorch/issues/57109
  85. init.kaiming_uniform_(self.weight, a=math.sqrt(5))
  86. if self.bias is not None:
  87. fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
  88. bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
  89. init.uniform_(self.bias, -bound, bound)
  90. def forward(self, input: Tensor) -> Tensor:
  91. return F.linear(input, self.weight, self.bias)
  92. def extra_repr(self) -> str:
  93. return 'in_features={}, out_features={}, bias={}'.format(
  94. self.in_features, self.out_features, self.bias is not None
  95. )
  96. # This class exists solely to avoid triggering an obscure error when scripting
  97. # an improperly quantized attention layer. See this issue for details:
  98. # https://github.com/pytorch/pytorch/issues/58969
  99. # TODO: fail fast on quantization API usage error, then remove this class
  100. # and replace uses of it with plain Linear
  101. class NonDynamicallyQuantizableLinear(Linear):
  102. def __init__(self, in_features: int, out_features: int, bias: bool = True,
  103. device=None, dtype=None) -> None:
  104. super().__init__(in_features, out_features, bias=bias,
  105. device=device, dtype=dtype)
  106. class Bilinear(Module):
  107. r"""Applies a bilinear transformation to the incoming data:
  108. :math:`y = x_1^T A x_2 + b`
  109. Args:
  110. in1_features: size of each first input sample
  111. in2_features: size of each second input sample
  112. out_features: size of each output sample
  113. bias: If set to False, the layer will not learn an additive bias.
  114. Default: ``True``
  115. Shape:
  116. - Input1: :math:`(*, H_{in1})` where :math:`H_{in1}=\text{in1\_features}` and
  117. :math:`*` means any number of additional dimensions including none. All but the last dimension
  118. of the inputs should be the same.
  119. - Input2: :math:`(*, H_{in2})` where :math:`H_{in2}=\text{in2\_features}`.
  120. - Output: :math:`(*, H_{out})` where :math:`H_{out}=\text{out\_features}`
  121. and all but the last dimension are the same shape as the input.
  122. Attributes:
  123. weight: the learnable weights of the module of shape
  124. :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`.
  125. The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
  126. :math:`k = \frac{1}{\text{in1\_features}}`
  127. bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
  128. If :attr:`bias` is ``True``, the values are initialized from
  129. :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
  130. :math:`k = \frac{1}{\text{in1\_features}}`
  131. Examples::
  132. >>> m = nn.Bilinear(20, 30, 40)
  133. >>> input1 = torch.randn(128, 20)
  134. >>> input2 = torch.randn(128, 30)
  135. >>> output = m(input1, input2)
  136. >>> print(output.size())
  137. torch.Size([128, 40])
  138. """
  139. __constants__ = ['in1_features', 'in2_features', 'out_features']
  140. in1_features: int
  141. in2_features: int
  142. out_features: int
  143. weight: Tensor
  144. def __init__(self, in1_features: int, in2_features: int, out_features: int, bias: bool = True,
  145. device=None, dtype=None) -> None:
  146. factory_kwargs = {'device': device, 'dtype': dtype}
  147. super().__init__()
  148. self.in1_features = in1_features
  149. self.in2_features = in2_features
  150. self.out_features = out_features
  151. self.weight = Parameter(torch.empty((out_features, in1_features, in2_features), **factory_kwargs))
  152. if bias:
  153. self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
  154. else:
  155. self.register_parameter('bias', None)
  156. self.reset_parameters()
  157. def reset_parameters(self) -> None:
  158. bound = 1 / math.sqrt(self.weight.size(1))
  159. init.uniform_(self.weight, -bound, bound)
  160. if self.bias is not None:
  161. init.uniform_(self.bias, -bound, bound)
  162. def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
  163. return F.bilinear(input1, input2, self.weight, self.bias)
  164. def extra_repr(self) -> str:
  165. return 'in1_features={}, in2_features={}, out_features={}, bias={}'.format(
  166. self.in1_features, self.in2_features, self.out_features, self.bias is not None
  167. )
  168. class LazyLinear(LazyModuleMixin, Linear):
  169. r"""A :class:`torch.nn.Linear` module where `in_features` is inferred.
  170. In this module, the `weight` and `bias` are of :class:`torch.nn.UninitializedParameter`
  171. class. They will be initialized after the first call to ``forward`` is done and the
  172. module will become a regular :class:`torch.nn.Linear` module. The ``in_features`` argument
  173. of the :class:`Linear` is inferred from the ``input.shape[-1]``.
  174. Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
  175. on lazy modules and their limitations.
  176. Args:
  177. out_features: size of each output sample
  178. bias: If set to ``False``, the layer will not learn an additive bias.
  179. Default: ``True``
  180. Attributes:
  181. weight: the learnable weights of the module of shape
  182. :math:`(\text{out\_features}, \text{in\_features})`. The values are
  183. initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
  184. :math:`k = \frac{1}{\text{in\_features}}`
  185. bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
  186. If :attr:`bias` is ``True``, the values are initialized from
  187. :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
  188. :math:`k = \frac{1}{\text{in\_features}}`
  189. """
  190. cls_to_become = Linear # type: ignore[assignment]
  191. weight: UninitializedParameter
  192. bias: UninitializedParameter # type: ignore[assignment]
  193. def __init__(self, out_features: int, bias: bool = True,
  194. device=None, dtype=None) -> None:
  195. factory_kwargs = {'device': device, 'dtype': dtype}
  196. # bias is hardcoded to False to avoid creating tensor
  197. # that will soon be overwritten.
  198. super().__init__(0, 0, False)
  199. self.weight = UninitializedParameter(**factory_kwargs)
  200. self.out_features = out_features
  201. if bias:
  202. self.bias = UninitializedParameter(**factory_kwargs)
  203. def reset_parameters(self) -> None:
  204. if not self.has_uninitialized_params() and self.in_features != 0:
  205. super().reset_parameters()
  206. def initialize_parameters(self, input) -> None: # type: ignore[override]
  207. if self.has_uninitialized_params():
  208. with torch.no_grad():
  209. self.in_features = input.shape[-1]
  210. self.weight.materialize((self.out_features, self.in_features))
  211. if self.bias is not None:
  212. self.bias.materialize((self.out_features,))
  213. self.reset_parameters()
  214. # TODO: PartialLinear - maybe in sparse?