- import warnings
- from .distance import PairwiseDistance
- from .module import Module
- from .. import functional as F
- from .. import _reduction as _Reduction
- from torch import Tensor
- from typing import Callable, Optional
- __all__ = ['L1Loss', 'NLLLoss', 'NLLLoss2d', 'PoissonNLLLoss', 'GaussianNLLLoss', 'KLDivLoss',
- 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss',
- 'SmoothL1Loss', 'HuberLoss', 'SoftMarginLoss', 'CrossEntropyLoss', 'MultiLabelSoftMarginLoss',
- 'CosineEmbeddingLoss', 'MarginRankingLoss', 'MultiMarginLoss', 'TripletMarginLoss',
- 'TripletMarginWithDistanceLoss', 'CTCLoss']
- class _Loss(Module):
- reduction: str
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__()
- if size_average is not None or reduce is not None:
- self.reduction: str = _Reduction.legacy_get_string(size_average, reduce)
- else:
- self.reduction = reduction
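- # Editorial note: ``_Reduction.legacy_get_string`` folds the deprecated
- # ``size_average``/``reduce`` flags into a single ``reduction`` string.
- # Roughly: ``reduce=False`` maps to ``'none'``, ``size_average=False`` with
- # ``reduce`` left at its default maps to ``'sum'``, and everything else maps
- # to ``'mean'``.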
- class _WeightedLoss(_Loss):
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- self.register_buffer('weight', weight)
- self.weight: Optional[Tensor]
- class L1Loss(_Loss):
- r"""Creates a criterion that measures the mean absolute error (MAE) between each element in
- the input :math:`x` and target :math:`y`.
- The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = \left| x_n - y_n \right|,
- where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
- (default ``'mean'``), then:
- .. math::
- \ell(x, y) =
- \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
- of :math:`n` elements each.
- The mean operation still operates over all the elements, and divides by :math:`n`.
- The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
- Supports real-valued and complex-valued inputs.
- Args:
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then
- :math:`(*)`, same shape as the input.
- Examples::
- >>> loss = nn.L1Loss()
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> target = torch.randn(3, 5)
- >>> output = loss(input, target)
- >>> output.backward()
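- >>>
- >>> # Editorial sketch: with the default ``'mean'`` reduction, the result should
- >>> # match a manual mean-absolute-error computation on the same tensors.
- >>> expected = (input - target).abs().mean()
- >>> torch.allclose(output, expected)
- True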
- """
- __constants__ = ['reduction']
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.l1_loss(input, target, reduction=self.reduction)
- class NLLLoss(_WeightedLoss):
- r"""The negative log likelihood loss. It is useful to train a classification
- problem with `C` classes.
- If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning
- weight to each of the classes. This is particularly useful when you have an
- unbalanced training set.
- The `input` given through a forward call is expected to contain
- log-probabilities of each class. `input` has to be a Tensor of size either
- :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
- with :math:`K \geq 1` for the `K`-dimensional case. The latter is useful for
- higher dimension inputs, such as computing NLL loss per-pixel for 2D images.
- Obtaining log-probabilities in a neural network is easily achieved by
- adding a `LogSoftmax` layer as the last layer of your network.
- You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
- layer.
- The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
- where `C = number of classes`; if `ignore_index` is specified, this loss also accepts
- this class index (this index may not necessarily be in the class range).
- The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = - w_{y_n} x_{n,y_n}, \quad
- w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},
- where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and
- :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
- (default ``'mean'``), then
- .. math::
- \ell(x, y) = \begin{cases}
- \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
- \text{if reduction} = \text{`mean';}\\
- \sum_{n=1}^N l_n, &
- \text{if reduction} = \text{`sum'.}
- \end{cases}
- Args:
- weight (Tensor, optional): a manual rescaling weight given to each
- class. If given, it has to be a Tensor of size `C`. Otherwise, it is
- treated as if having all ones.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``None``
- ignore_index (int, optional): Specifies a target value that is ignored
- and does not contribute to the input gradient. When
- :attr:`size_average` is ``True``, the loss is averaged over
- non-ignored targets.
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``None``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
- be applied, ``'mean'``: the weighted mean of the output is taken,
- ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in
- the meantime, specifying either of those two args will override
- :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, or
- :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
- in the case of `K`-dimensional loss.
- - Target: :math:`(N)` or :math:`()`, where each value is
- :math:`0 \leq \text{targets}[i] \leq C-1`, or
- :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
- K-dimensional loss.
- - Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or
- :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss.
- Otherwise, scalar.
- Examples::
- >>> m = nn.LogSoftmax(dim=1)
- >>> loss = nn.NLLLoss()
- >>> # input is of size N x C = 3 x 5
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> # each element in target has to have 0 <= value < C
- >>> target = torch.tensor([1, 0, 4])
- >>> output = loss(m(input), target)
- >>> output.backward()
- >>>
- >>>
- >>> # 2D loss example (used, for example, with image inputs)
- >>> N, C = 5, 4
- >>> loss = nn.NLLLoss()
- >>> # input is of size N x C x height x width
- >>> data = torch.randn(N, 16, 10, 10)
- >>> conv = nn.Conv2d(16, C, (3, 3))
- >>> m = nn.LogSoftmax(dim=1)
- >>> # each element in target has to have 0 <= value < C
- >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
- >>> output = loss(m(conv(data)), target)
- >>> output.backward()
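- >>>
- >>> # Editorial sketch (``logits``/``labels`` are illustrative names): LogSoftmax
- >>> # followed by NLLLoss should match CrossEntropyLoss on the same logits.
- >>> logits = torch.randn(3, 5)
- >>> labels = torch.tensor([1, 0, 4])
- >>> nll = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), labels)
- >>> ce = nn.CrossEntropyLoss()(logits, labels)
- >>> torch.allclose(nll, ce)
- True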
- """
- __constants__ = ['ignore_index', 'reduction']
- ignore_index: int
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
- reduce=None, reduction: str = 'mean') -> None:
- super().__init__(weight, size_average, reduce, reduction)
- self.ignore_index = ignore_index
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
- class NLLLoss2d(NLLLoss):
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
- reduce=None, reduction: str = 'mean') -> None:
- warnings.warn("NLLLoss2d has been deprecated. "
- "Please use NLLLoss instead as a drop-in replacement and see "
- "https://pytorch.org/docs/master/nn.html#torch.nn.NLLLoss for more details.")
- super().__init__(weight, size_average, ignore_index, reduce, reduction)
- class PoissonNLLLoss(_Loss):
- r"""Negative log likelihood loss with Poisson distribution of target.
- The loss can be described as:
- .. math::
- \text{target} \sim \mathrm{Poisson}(\text{input})
- \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
- + \log(\text{target!})
- The last term can be omitted or approximated with Stirling's formula. The
- approximation is used for target values greater than 1. For targets less than
- or equal to 1, zeros are added to the loss.
- Args:
- log_input (bool, optional): if ``True`` the loss is computed as
- :math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is
- :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`.
- full (bool, optional): whether to compute the full loss, i.e. to add the
- Stirling approximation term
- .. math::
- \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}).
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when
- :attr:`log_input = False`. Default: 1e-8
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Examples::
- >>> loss = nn.PoissonNLLLoss()
- >>> log_input = torch.randn(5, 2, requires_grad=True)
- >>> target = torch.randn(5, 2)
- >>> output = loss(log_input, target)
- >>> output.backward()
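- >>>
- >>> # Editorial sketch: with the defaults ``log_input=True`` and ``full=False``,
- >>> # the result should match the formula above applied to the same tensors.
- >>> expected = (torch.exp(log_input) - target * log_input).mean()
- >>> torch.allclose(output, expected)
- True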
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(*)`,
- the same shape as the input.
- """
- __constants__ = ['log_input', 'full', 'eps', 'reduction']
- log_input: bool
- full: bool
- eps: float
- def __init__(self, log_input: bool = True, full: bool = False, size_average=None,
- eps: float = 1e-8, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- self.log_input = log_input
- self.full = full
- self.eps = eps
- def forward(self, log_input: Tensor, target: Tensor) -> Tensor:
- return F.poisson_nll_loss(log_input, target, log_input=self.log_input, full=self.full,
- eps=self.eps, reduction=self.reduction)
- class GaussianNLLLoss(_Loss):
- r"""Gaussian negative log likelihood loss.
- The targets are treated as samples from Gaussian distributions with
- expectations and variances predicted by the neural network. For a
- ``target`` tensor modelled as having Gaussian distribution with a tensor
- of expectations ``input`` and a tensor of positive variances ``var`` the loss is:
- .. math::
- \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var},
- \ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{target}\right)^2}
- {\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.}
- where :attr:`eps` is used for stability. By default, the constant term of
- the loss function is omitted unless :attr:`full` is ``True``. If ``var`` is not the same
- size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension
- of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting.
- Args:
- full (bool, optional): include the constant term in the loss
- calculation. Default: ``False``.
- eps (float, optional): value used to clamp ``var`` (see note below), for
- stability. Default: 1e-6.
- reduction (str, optional): specifies the reduction to apply to the
- output:``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction
- will be applied, ``'mean'``: the output is the average of all batch
- member losses, ``'sum'``: the output is the sum of all batch member
- losses. Default: ``'mean'``.
- Shape:
- - Input: :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional
- dimensions
- - Target: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input
- but with one dimension equal to 1 (to allow for broadcasting)
- - Var: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but
- with one dimension equal to 1, or same shape as the input but with one fewer
- dimension (to allow for broadcasting)
- - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or
- ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
- shape as the input
- Examples::
- >>> loss = nn.GaussianNLLLoss()
- >>> input = torch.randn(5, 2, requires_grad=True)
- >>> target = torch.randn(5, 2)
- >>> var = torch.ones(5, 2, requires_grad=True) # heteroscedastic
- >>> output = loss(input, target, var)
- >>> output.backward()
- >>> loss = nn.GaussianNLLLoss()
- >>> input = torch.randn(5, 2, requires_grad=True)
- >>> target = torch.randn(5, 2)
- >>> var = torch.ones(5, 1, requires_grad=True) # homoscedastic
- >>> output = loss(input, target, var)
- >>> output.backward()
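- >>>
- >>> # Editorial sketch: with ``var`` well above ``eps`` the clamp is a no-op, so
- >>> # the result should match the formula without the constant term.
- >>> expected = (0.5 * (var.log() + (input - target) ** 2 / var)).mean()
- >>> torch.allclose(output, expected)
- True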
- Note:
- The clamping of ``var`` is ignored with respect to autograd, and so the
- gradients are unaffected by it.
- Reference:
- Nix, D. A. and Weigend, A. S., "Estimating the mean and variance of the
- target probability distribution", Proceedings of 1994 IEEE International
- Conference on Neural Networks (ICNN'94), Orlando, FL, USA, 1994, pp. 55-60
- vol.1, doi: 10.1109/ICNN.1994.374138.
- """
- __constants__ = ['full', 'eps', 'reduction']
- full: bool
- eps: float
- def __init__(self, *, full: bool = False, eps: float = 1e-6, reduction: str = 'mean') -> None:
- super().__init__(None, None, reduction)
- self.full = full
- self.eps = eps
- def forward(self, input: Tensor, target: Tensor, var: Tensor) -> Tensor:
- return F.gaussian_nll_loss(input, target, var, full=self.full, eps=self.eps, reduction=self.reduction)
- class KLDivLoss(_Loss):
- r"""The Kullback-Leibler divergence loss.
- For tensors of the same shape :math:`y_{\text{pred}},\ y_{\text{true}}`,
- where :math:`y_{\text{pred}}` is the :attr:`input` and :math:`y_{\text{true}}` is the
- :attr:`target`, we define the **pointwise KL-divergence** as
- .. math::
- L(y_{\text{pred}},\ y_{\text{true}})
- = y_{\text{true}} \cdot \log \frac{y_{\text{true}}}{y_{\text{pred}}}
- = y_{\text{true}} \cdot (\log y_{\text{true}} - \log y_{\text{pred}})
- To avoid underflow issues when computing this quantity, this loss expects the argument
- :attr:`input` in the log-space. The argument :attr:`target` may also be provided in the
- log-space if :attr:`log_target`\ `= True`.
- To summarise, this function is roughly equivalent to computing
- .. code-block:: python
- if not log_target: # default
- loss_pointwise = target * (target.log() - input)
- else:
- loss_pointwise = target.exp() * (target - input)
- and then reducing this result depending on the argument :attr:`reduction` as
- .. code-block:: python
- if reduction == "mean": # default
- loss = loss_pointwise.mean()
- elif reduction == "batchmean": # mathematically correct
- loss = loss_pointwise.sum() / input.size(0)
- elif reduction == "sum":
- loss = loss_pointwise.sum()
- else: # reduction == "none"
- loss = loss_pointwise
- .. note::
- As with all the other losses in PyTorch, this function expects the first argument,
- :attr:`input`, to be the output of the model (e.g. the neural network)
- and the second, :attr:`target`, to be the observations in the dataset.
- This differs from the standard mathematical notation :math:`KL(P\ ||\ Q)` where
- :math:`P` denotes the distribution of the observations and :math:`Q` denotes the model.
- .. warning::
- :attr:`reduction`\ `= "mean"` doesn't return the true KL divergence value, please use
- :attr:`reduction`\ `= "batchmean"` which aligns with the mathematical definition.
- In a future release, `"mean"` will be changed to be the same as `"batchmean"`.
- Args:
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to `False`, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is `False`. Default: `True`
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is `False`, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: `True`
- reduction (str, optional): Specifies the reduction to apply to the output. Default: `"mean"`
- log_target (bool, optional): Specifies whether `target` is in the log space. Default: `False`
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar by default. If :attr:`reduction` is `'none'`, then :math:`(*)`,
- same shape as the input.
- Examples::
- >>> import torch.nn.functional as F
- >>> kl_loss = nn.KLDivLoss(reduction="batchmean")
- >>> # input should be a distribution in the log space
- >>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1)
- >>> # Sample a batch of distributions. Usually this would come from the dataset
- >>> target = F.softmax(torch.rand(3, 5), dim=1)
- >>> output = kl_loss(input, target)
- >>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True)
- >>> log_target = F.log_softmax(torch.rand(3, 5), dim=1)
- >>> output = kl_loss(input, log_target)
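- >>>
- >>> # Editorial sketch: the ``'batchmean'`` result should match the pointwise
- >>> # formula above, summed and divided by the batch size.
- >>> expected = (log_target.exp() * (log_target - input)).sum() / input.size(0)
- >>> torch.allclose(output, expected)
- True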
- """
- __constants__ = ['reduction']
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', log_target: bool = False) -> None:
- super().__init__(size_average, reduce, reduction)
- self.log_target = log_target
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.kl_div(input, target, reduction=self.reduction, log_target=self.log_target)
- class MSELoss(_Loss):
- r"""Creates a criterion that measures the mean squared error (squared L2 norm) between
- each element in the input :math:`x` and target :math:`y`.
- The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = \left( x_n - y_n \right)^2,
- where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
- (default ``'mean'``), then:
- .. math::
- \ell(x, y) =
- \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
- of :math:`n` elements each.
- The mean operation still operates over all the elements, and divides by :math:`n`.
- The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
- Args:
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- Examples::
- >>> loss = nn.MSELoss()
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> target = torch.randn(3, 5)
- >>> output = loss(input, target)
- >>> output.backward()
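- >>>
- >>> # Editorial sketch: with the default ``'mean'`` reduction, the result should
- >>> # match a manual mean-squared-error computation on the same tensors.
- >>> expected = ((input - target) ** 2).mean()
- >>> torch.allclose(output, expected)
- True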
- """
- __constants__ = ['reduction']
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.mse_loss(input, target, reduction=self.reduction)
- class BCELoss(_WeightedLoss):
- r"""Creates a criterion that measures the Binary Cross Entropy between the target and
- the input probabilities:
- The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],
- where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
- (default ``'mean'``), then
- .. math::
- \ell(x, y) = \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- This is used for measuring the error of a reconstruction in, for example,
- an auto-encoder. Note that the targets :math:`y` should be numbers
- between 0 and 1.
- Notice that if :math:`x_n` is either 0 or 1, one of the log terms would be
- mathematically undefined in the above loss equation. PyTorch chooses to set
- :math:`\log (0) = -\infty`, since :math:`\lim_{x\to 0} \log (x) = -\infty`.
- However, an infinite term in the loss equation is not desirable for several reasons.
- For one, if either :math:`y_n = 0` or :math:`(1 - y_n) = 0`, then we would be
- multiplying 0 with infinity. Secondly, if we have an infinite loss value, then
- we would also have an infinite term in our gradient, since
- :math:`\lim_{x\to 0} \frac{d}{dx} \log (x) = \infty`.
- This would make BCELoss's backward method nonlinear with respect to :math:`x_n`,
- and using it for things like linear regression would not be straightforward.
- Our solution is that BCELoss clamps its log function outputs to be greater than
- or equal to -100. This way, we can always have a finite loss value and a linear
- backward method.
- Args:
- weight (Tensor, optional): a manual rescaling weight given to the loss
- of each batch element. If given, has to be a Tensor of size `nbatch`.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
- shape as input.
- Examples::
- >>> m = nn.Sigmoid()
- >>> loss = nn.BCELoss()
- >>> input = torch.randn(3, requires_grad=True)
- >>> target = torch.empty(3).random_(2)
- >>> output = loss(m(input), target)
- >>> output.backward()
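- >>>
- >>> # Editorial sketch: the sigmoid outputs lie strictly inside (0, 1), so the
- >>> # log clamping has no effect and the result should match the formula directly.
- >>> p = m(input)
- >>> expected = -(target * p.log() + (1 - target) * (1 - p).log()).mean()
- >>> torch.allclose(output, expected)
- True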
- """
- __constants__ = ['reduction']
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(weight, size_average, reduce, reduction)
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
- class BCEWithLogitsLoss(_Loss):
- r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
- class. This version is more numerically stable than using a plain `Sigmoid`
- followed by a `BCELoss` as, by combining the operations into one layer,
- we take advantage of the log-sum-exp trick for numerical stability.
- The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
- + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
- where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
- (default ``'mean'``), then
- .. math::
- \ell(x, y) = \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- This is used for measuring the error of a reconstruction in, for example,
- an auto-encoder. Note that the targets `t[i]` should be numbers
- between 0 and 1.
- It's possible to trade off recall and precision by adding weights to positive examples.
- In the case of multi-label classification the loss can be described as:
- .. math::
- \ell_c(x, y) = L_c = \{l_{1,c},\dots,l_{N,c}\}^\top, \quad
- l_{n,c} = - w_{n,c} \left[ p_c y_{n,c} \cdot \log \sigma(x_{n,c})
- + (1 - y_{n,c}) \cdot \log (1 - \sigma(x_{n,c})) \right],
- where :math:`c` is the class number (:math:`c > 1` for multi-label binary classification,
- :math:`c = 1` for single-label binary classification),
- :math:`n` is the number of the sample in the batch and
- :math:`p_c` is the weight of the positive answer for the class :math:`c`.
- :math:`p_c > 1` increases the recall, :math:`p_c < 1` increases the precision.
- For example, if a dataset contains 100 positive and 300 negative examples of a single class,
- then `pos_weight` for the class should be equal to :math:`\frac{300}{100}=3`.
- The loss would act as if the dataset contains :math:`3\times 100=300` positive examples.
- Examples::
- >>> target = torch.ones([10, 64], dtype=torch.float32) # 64 classes, batch size = 10
- >>> output = torch.full([10, 64], 1.5) # A prediction (logit)
- >>> pos_weight = torch.ones([64]) # All weights are equal to 1
- >>> criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
- >>> criterion(output, target) # -log(sigmoid(1.5))
- tensor(0.20...)
- Args:
- weight (Tensor, optional): a manual rescaling weight given to the loss
- of each batch element. If given, has to be a Tensor of size `nbatch`.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- pos_weight (Tensor, optional): a weight of positive examples.
- Must be a vector with length equal to the number of classes.
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
- shape as input.
- Examples::
- >>> loss = nn.BCEWithLogitsLoss()
- >>> input = torch.randn(3, requires_grad=True)
- >>> target = torch.empty(3).random_(2)
- >>> output = loss(input, target)
- >>> output.backward()
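- >>>
- >>> # Editorial sketch: the result should match a plain Sigmoid followed by
- >>> # BCELoss on the same tensors, up to floating-point tolerance.
- >>> torch.allclose(output, nn.BCELoss()(torch.sigmoid(input), target))
- True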
- """
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean',
- pos_weight: Optional[Tensor] = None) -> None:
- super().__init__(size_average, reduce, reduction)
- self.register_buffer('weight', weight)
- self.register_buffer('pos_weight', pos_weight)
- self.weight: Optional[Tensor]
- self.pos_weight: Optional[Tensor]
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.binary_cross_entropy_with_logits(input, target,
- self.weight,
- pos_weight=self.pos_weight,
- reduction=self.reduction)
- class HingeEmbeddingLoss(_Loss):
- r"""Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`
- (containing 1 or -1).
- This is usually used for measuring whether two inputs are similar or
- dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
- used for learning nonlinear embeddings or semi-supervised learning.
- The loss function for :math:`n`-th sample in the mini-batch is
- .. math::
- l_n = \begin{cases}
- x_n, & \text{if}\; y_n = 1,\\
- \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
- \end{cases}
- and the total loss functions is
- .. math::
- \ell(x, y) = \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- where :math:`L = \{l_1,\dots,l_N\}^\top`.
- Args:
- margin (float, optional): Has a default value of `1`.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(*)` where :math:`*` means any number of dimensions. The sum operation
- operates over all the elements.
- - Target: :math:`(*)`, same shape as the input
- - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
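- The example below is an editorial sketch (the tensors are illustrative) showing
- the two branches of the loss: entries with :math:`y_n = 1` contribute :math:`x_n`,
- and entries with :math:`y_n = -1` contribute :math:`\max\{0, \Delta - x_n\}`.
- Examples::
- >>> loss = nn.HingeEmbeddingLoss(margin=1.0)
- >>> x = torch.tensor([0.2, 0.7, 1.5])
- >>> y = torch.tensor([1., -1., -1.])
- >>> loss(x, y)  # mean of [0.2, max(0, 1 - 0.7), max(0, 1 - 1.5)]
- tensor(0.1667)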
- """
- __constants__ = ['margin', 'reduction']
- margin: float
- def __init__(self, margin: float = 1.0, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- self.margin = margin
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.hinge_embedding_loss(input, target, margin=self.margin, reduction=self.reduction)
- class MultiLabelMarginLoss(_Loss):
- r"""Creates a criterion that optimizes a multi-class multi-classification
- hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`)
- and output :math:`y` (which is a 2D `Tensor` of target class indices).
- For each sample in the mini-batch:
- .. math::
- \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}
- where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
- :math:`j \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
- :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
- and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.
- :math:`y` and :math:`x` must have the same size.
- The criterion only considers a contiguous block of non-negative targets that
- starts at the front.
- This allows for different samples to have variable amounts of target classes.
- Args:
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C`
- is the number of classes.
- - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
- Examples::
- >>> loss = nn.MultiLabelMarginLoss()
- >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
- >>> # for target y, only consider labels 3 and 0, not after label -1
- >>> y = torch.LongTensor([[3, 0, -1, 1]])
- >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
- >>> loss(x, y)
- tensor(0.85...)
- """
- __constants__ = ['reduction']
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.multilabel_margin_loss(input, target, reduction=self.reduction)
- class SmoothL1Loss(_Loss):
- r"""Creates a criterion that uses a squared term if the absolute
- element-wise error falls below beta and an L1 term otherwise.
- It is less sensitive to outliers than :class:`torch.nn.MSELoss` and in some cases
- prevents exploding gradients (e.g. see the paper `Fast R-CNN`_ by Ross Girshick).
- For a batch of size :math:`N`, the unreduced loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1, ..., l_N\}^T
- with
- .. math::
- l_n = \begin{cases}
- 0.5 (x_n - y_n)^2 / beta, & \text{if } |x_n - y_n| < beta \\
- |x_n - y_n| - 0.5 * beta, & \text{otherwise }
- \end{cases}
- If `reduction` is not `none`, then:
- .. math::
- \ell(x, y) =
- \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- .. note::
- Smooth L1 loss can be seen as exactly :class:`L1Loss`, but with the :math:`|x - y| < beta`
- portion replaced with a quadratic function such that its slope is 1 at :math:`|x - y| = beta`.
- The quadratic segment smooths the L1 loss near :math:`|x - y| = 0`.
- .. note::
- Smooth L1 loss is closely related to :class:`HuberLoss`, being
- equivalent to :math:`huber(x, y) / beta` (note that Smooth L1's beta hyper-parameter is
- also known as delta for Huber). This leads to the following differences:
- * As beta -> 0, Smooth L1 loss converges to :class:`L1Loss`, while :class:`HuberLoss`
- converges to a constant 0 loss. When beta is 0, Smooth L1 loss is equivalent to L1 loss.
- * As beta -> :math:`+\infty`, Smooth L1 loss converges to a constant 0 loss, while
- :class:`HuberLoss` converges to :class:`MSELoss`.
- * For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant slope of 1.
- For :class:`HuberLoss`, the slope of the L1 segment is beta.
- .. _`Fast R-CNN`: https://arxiv.org/abs/1504.08083
- Args:
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- beta (float, optional): Specifies the threshold at which to change between L1 and L2 loss.
- The value must be non-negative. Default: 1.0
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
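- The relationship to :class:`HuberLoss` noted above can be checked directly; the
- example below is an editorial sketch with illustrative shapes and ``beta``.
- Examples::
- >>> input = torch.randn(3, 5)
- >>> target = torch.randn(3, 5)
- >>> beta = 0.5
- >>> smooth = nn.SmoothL1Loss(beta=beta)(input, target)
- >>> huber = nn.HuberLoss(delta=beta)(input, target)
- >>> torch.allclose(smooth, huber / beta)
- True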
- """
- __constants__ = ['reduction']
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', beta: float = 1.0) -> None:
- super().__init__(size_average, reduce, reduction)
- self.beta = beta
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
- class HuberLoss(_Loss):
- r"""Creates a criterion that uses a squared term if the absolute
- element-wise error falls below delta and a delta-scaled L1 term otherwise.
- This loss combines advantages of both :class:`L1Loss` and :class:`MSELoss`; the
- delta-scaled L1 region makes the loss less sensitive to outliers than :class:`MSELoss`,
- while the L2 region provides smoothness over :class:`L1Loss` near 0. See
- `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`_ for more information.
- For a batch of size :math:`N`, the unreduced loss can be described as:
- .. math::
- \ell(x, y) = L = \{l_1, ..., l_N\}^T
- with
- .. math::
- l_n = \begin{cases}
- 0.5 (x_n - y_n)^2, & \text{if } |x_n - y_n| < delta \\
- delta * (|x_n - y_n| - 0.5 * delta), & \text{otherwise }
- \end{cases}
- If `reduction` is not `none`, then:
- .. math::
- \ell(x, y) =
- \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- .. note::
- When delta is set to 1, this loss is equivalent to :class:`SmoothL1Loss`.
- In general, this loss differs from :class:`SmoothL1Loss` by a factor of delta (AKA beta
- in Smooth L1).
- See :class:`SmoothL1Loss` for additional discussion on the differences in behavior
- between the two losses.
- Args:
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
- delta (float, optional): Specifies the threshold at which to change between delta-scaled L1 and L2 loss.
- The value must be positive. Default: 1.0
- Shape:
- - Input: :math:`(*)` where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
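- A minimal usage sketch (editorial; the shapes and ``delta`` are illustrative):
- Examples::
- >>> loss = nn.HuberLoss(delta=2.0)
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> target = torch.randn(3, 5)
- >>> output = loss(input, target)
- >>> output.backward()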
- """
- __constants__ = ['reduction', 'delta']
- def __init__(self, reduction: str = 'mean', delta: float = 1.0) -> None:
- super().__init__(reduction=reduction)
- self.delta = delta
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.huber_loss(input, target, reduction=self.reduction, delta=self.delta)
- class SoftMarginLoss(_Loss):
- r"""Creates a criterion that optimizes a two-class classification
- logistic loss between input tensor :math:`x` and target tensor :math:`y`
- (containing 1 or -1).
- .. math::
- \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}
- Args:
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
- - Target: :math:`(*)`, same shape as the input.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same
- shape as input.
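- The reduced value can be checked against the formula above; the example below is
- an editorial sketch with illustrative tensors.
- Examples::
- >>> input = torch.randn(4)
- >>> target = torch.tensor([1., -1., 1., -1.])
- >>> output = nn.SoftMarginLoss()(input, target)
- >>> expected = torch.log1p(torch.exp(-target * input)).mean()
- >>> torch.allclose(output, expected)
- True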
- """
- __constants__ = ['reduction']
- def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.soft_margin_loss(input, target, reduction=self.reduction)
- class CrossEntropyLoss(_WeightedLoss):
- r"""This criterion computes the cross entropy loss between input logits
- and target.
- It is useful when training a classification problem with `C` classes.
- If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
- assigning weight to each of the classes.
- This is particularly useful when you have an unbalanced training set.
- The `input` is expected to contain the unnormalized logits for each class (which do `not` need
- to be positive or sum to 1, in general).
- `input` has to be a Tensor of size :math:`(C)` for unbatched input,
- :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` for the
- `K`-dimensional case. The latter is useful for higher dimension inputs, such
- as computing cross entropy loss per-pixel for 2D images.
- The `target` that this criterion expects should contain either:
- - Class indices in the range :math:`[0, C)` where :math:`C` is the number of classes; if
- `ignore_index` is specified, this loss also accepts this class index (this index
- may not necessarily be in the class range). The unreduced (i.e. with :attr:`reduction`
- set to ``'none'``) loss for this case can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
- \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}
- where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight,
- :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as
- :math:`d_1, ..., d_k` for the `K`-dimensional case. If
- :attr:`reduction` is not ``'none'`` (default ``'mean'``), then
- .. math::
- \ell(x, y) = \begin{cases}
- \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}} l_n, &
- \text{if reduction} = \text{`mean';}\\
- \sum_{n=1}^N l_n, &
- \text{if reduction} = \text{`sum'.}
- \end{cases}
- Note that this case is equivalent to the combination of :class:`~torch.nn.LogSoftmax` and
- :class:`~torch.nn.NLLLoss`.
- - Probabilities for each class; useful when labels beyond a single class per minibatch item
- are required, such as for blended labels, label smoothing, etc. The unreduced (i.e. with
- :attr:`reduction` set to ``'none'``) loss for this case can be described as:
- .. math::
- \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c}
- where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight,
- :math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as
- :math:`d_1, ..., d_k` for the `K`-dimensional case. If
- :attr:`reduction` is not ``'none'`` (default ``'mean'``), then
- .. math::
- \ell(x, y) = \begin{cases}
- \frac{\sum_{n=1}^N l_n}{N}, &
- \text{if reduction} = \text{`mean';}\\
- \sum_{n=1}^N l_n, &
- \text{if reduction} = \text{`sum'.}
- \end{cases}
- .. note::
- The performance of this criterion is generally better when `target` contains class
- indices, as this allows for optimized computation. Consider providing `target` as
- class probabilities only when a single class label per minibatch item is too restrictive.
- Args:
- weight (Tensor, optional): a manual rescaling weight given to each class.
- If given, has to be a Tensor of size `C`
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- ignore_index (int, optional): Specifies a target value that is ignored
- and does not contribute to the input gradient. When :attr:`size_average` is
- ``True``, the loss is averaged over non-ignored targets. Note that
- :attr:`ignore_index` is only applicable when the target contains class indices.
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will
- be applied, ``'mean'``: the weighted mean of the output is taken,
- ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in
- the meantime, specifying either of those two args will override
- :attr:`reduction`. Default: ``'mean'``
- label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
- of smoothing when computing the loss, where 0.0 means no smoothing. The targets
- become a mixture of the original ground truth and a uniform distribution as described in
- `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
- Shape:
- - Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
- in the case of `K`-dimensional loss.
- - Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
- :math:`K \geq 1` in the case of K-dimensional loss, where each value should be in the range :math:`[0, C)`.
- If containing class probabilities, the target must have the same shape as the input, and each value should be in the range :math:`[0, 1]`.
- - Output: If :attr:`reduction` is ``'none'``, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
- in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.
- where:
- .. math::
- \begin{aligned}
- C ={} & \text{number of classes} \\
- N ={} & \text{batch size} \\
- \end{aligned}
- Examples::
- >>> # Example of target with class indices
- >>> loss = nn.CrossEntropyLoss()
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> target = torch.empty(3, dtype=torch.long).random_(5)
- >>> output = loss(input, target)
- >>> output.backward()
- >>>
- >>> # Example of target with class probabilities
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> target = torch.randn(3, 5).softmax(dim=1)
- >>> output = loss(input, target)
- >>> output.backward()
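- >>>
- >>> # Illustrative label-smoothing example (added sketch; the smoothing value
- >>> # below is arbitrary and not part of the original docstring)
- >>> loss = nn.CrossEntropyLoss(label_smoothing=0.1)
- >>> input = torch.randn(3, 5, requires_grad=True)
- >>> target = torch.empty(3, dtype=torch.long).random_(5)
- >>> output = loss(input, target)
- >>> output.backward()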
- """
- __constants__ = ['ignore_index', 'reduction', 'label_smoothing']
- ignore_index: int
- label_smoothing: float
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
- reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None:
- super().__init__(weight, size_average, reduce, reduction)
- self.ignore_index = ignore_index
- self.label_smoothing = label_smoothing
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.cross_entropy(input, target, weight=self.weight,
- ignore_index=self.ignore_index, reduction=self.reduction,
- label_smoothing=self.label_smoothing)
- class MultiLabelSoftMarginLoss(_WeightedLoss):
- r"""Creates a criterion that optimizes a multi-label one-versus-all
- loss based on max-entropy, between input :math:`x` and target :math:`y` of size
- :math:`(N, C)`.
- For each sample in the minibatch:
- .. math::
- loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1})
- + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right)
- where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`,
- :math:`y[i] \in \left\{0, \; 1\right\}`.
- Args:
- weight (Tensor, optional): a manual rescaling weight given to each
- class. If given, it has to be a Tensor of size `C`. Otherwise, it is
- treated as if having all ones.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes.
- - Target: :math:`(N, C)`, label targets must have the same shape as the input, with each value in :math:`\{0, 1\}`.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
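- A minimal usage sketch (illustrative addition; the shapes and values below are
- arbitrary and not taken from the original docstring):
- Examples::
- >>> loss = nn.MultiLabelSoftMarginLoss()
- >>> input = torch.randn(3, 4, requires_grad=True)
- >>> target = torch.empty(3, 4).random_(2)  # multi-hot targets in {0, 1}
- >>> output = loss(input, target)
- >>> output.backward()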
- """
- __constants__ = ['reduction']
- def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(weight, size_average, reduce, reduction)
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction)
- class CosineEmbeddingLoss(_Loss):
- r"""Creates a criterion that measures the loss given input tensors
- :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1.
- This is used for measuring whether two inputs are similar or dissimilar,
- using the cosine similarity, and is typically used for learning nonlinear
- embeddings or semi-supervised learning.
- The loss function for each sample is:
- .. math::
- \text{loss}(x, y) =
- \begin{cases}
- 1 - \cos(x_1, x_2), & \text{if } y = 1 \\
- \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
- \end{cases}
- Args:
- margin (float, optional): Should be a number from :math:`-1` to :math:`1`;
- :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
- default value is :math:`0`.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input1: :math:`(N, D)` or :math:`(D)`, where `N` is the batch size and `D` is the embedding dimension.
- - Input2: :math:`(N, D)` or :math:`(D)`, same shape as Input1.
- - Target: :math:`(N)` or :math:`()`.
- - Output: If :attr:`reduction` is ``'none'``, then :math:`(N)`, otherwise scalar.
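- A minimal usage sketch (illustrative addition; the shapes, margin, and labels
- below are arbitrary and not taken from the original docstring):
- Examples::
- >>> loss = nn.CosineEmbeddingLoss(margin=0.5)
- >>> input1 = torch.randn(3, 6, requires_grad=True)
- >>> input2 = torch.randn(3, 6, requires_grad=True)
- >>> target = torch.tensor([1, -1, 1])  # 1 = similar pair, -1 = dissimilar pair
- >>> output = loss(input1, input2, target)
- >>> output.backward()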
- """
- __constants__ = ['margin', 'reduction']
- margin: float
- def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- self.margin = margin
- def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
- return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
- class MarginRankingLoss(_Loss):
- r"""Creates a criterion that measures the loss given
- inputs :math:`x1`, :math:`x2`, two 1D mini-batch or 0D `Tensors`,
- and a label 1D mini-batch or 0D `Tensor` :math:`y` (containing 1 or -1).
- If :math:`y = 1` then it is assumed that the first input should be ranked higher
- (have a larger value) than the second input, and vice-versa for :math:`y = -1`.
- The loss function for each pair of samples in the mini-batch is:
- .. math::
- \text{loss}(x1, x2, y) = \max(0, -y * (x1 - x2) + \text{margin})
- Args:
- margin (float, optional): Has a default value of :math:`0`.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input1: :math:`(N)` or :math:`()` where `N` is the batch size.
- - Input2: :math:`(N)` or :math:`()`, same shape as the Input1.
- - Target: :math:`(N)` or :math:`()`, same shape as the inputs.
- - Output: scalar. If :attr:`reduction` is ``'none'`` and Input size is not :math:`()`, then :math:`(N)`.
- Examples::
- >>> loss = nn.MarginRankingLoss()
- >>> input1 = torch.randn(3, requires_grad=True)
- >>> input2 = torch.randn(3, requires_grad=True)
- >>> target = torch.randn(3).sign()
- >>> output = loss(input1, input2, target)
- >>> output.backward()
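- >>>
- >>> # With a positive margin (illustrative addition, not part of the original
- >>> # docstring); reuses input1, input2 and target from above
- >>> loss_with_margin = nn.MarginRankingLoss(margin=0.5)
- >>> output = loss_with_margin(input1, input2, target)
- >>> output.backward()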
- """
- __constants__ = ['margin', 'reduction']
- margin: float
- def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None:
- super().__init__(size_average, reduce, reduction)
- self.margin = margin
- def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
- return F.margin_ranking_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
- class MultiMarginLoss(_WeightedLoss):
- r"""Creates a criterion that optimizes a multi-class classification hinge
- loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
- output :math:`y` (which is a 1D tensor of target class indices,
- :math:`0 \leq y \leq \text{x.size}(1)-1`):
- For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar
- output :math:`y` is:
- .. math::
- \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}
- where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`
- and :math:`i \neq y`.
- Optionally, you can give non-equal weighting on the classes by passing
- a 1D :attr:`weight` tensor into the constructor.
- The loss function then becomes:
- .. math::
- \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p}{\text{x.size}(0)}
- Args:
- p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
- are the only supported values.
- margin (float, optional): Has a default value of :math:`1`.
- weight (Tensor, optional): a manual rescaling weight given to each
- class. If given, it has to be a Tensor of size `C`. Otherwise, it is
- treated as if having all ones.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes.
- - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target.
- Examples::
- >>> loss = nn.MultiMarginLoss()
- >>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]])
- >>> y = torch.tensor([3])
- >>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
- >>> loss(x, y)
- tensor(0.32...)
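- >>>
- >>> # Weighted variant (illustrative addition, not part of the original docstring):
- >>> # each sample's hinge terms are scaled by weight[y]
- >>> weighted_loss = nn.MultiMarginLoss(weight=torch.tensor([1.0, 1.0, 1.0, 0.5]))
- >>> output = weighted_loss(x, y)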
- """
- __constants__ = ['p', 'margin', 'reduction']
- margin: float
- p: int
- def __init__(self, p: int = 1, margin: float = 1., weight: Optional[Tensor] = None, size_average=None,
- reduce=None, reduction: str = 'mean') -> None:
- super().__init__(weight, size_average, reduce, reduction)
- if p != 1 and p != 2:
- raise ValueError("only p == 1 and p == 2 supported")
- assert weight is None or weight.dim() == 1
- self.p = p
- self.margin = margin
- def forward(self, input: Tensor, target: Tensor) -> Tensor:
- return F.multi_margin_loss(input, target, p=self.p, margin=self.margin,
- weight=self.weight, reduction=self.reduction)
- class TripletMarginLoss(_Loss):
- r"""Creates a criterion that measures the triplet loss given an input
- tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
- This is used for measuring a relative similarity between samples. A triplet
- is composed of `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative
- examples` respectively). The shapes of all input tensors should be
- :math:`(N, D)`.
- The distance swap is described in detail in the paper `Learning shallow
- convolutional feature descriptors with triplet losses`_ by
- V. Balntas, E. Riba et al.
- The loss function for each sample in the mini-batch is:
- .. math::
- L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}
- where
- .. math::
- d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
- See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the
- triplet margin loss for input tensors using a custom distance function.
- Args:
- margin (float, optional): Default: :math:`1`.
- p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
- swap (bool, optional): The distance swap is described in detail in the paper
- `Learning shallow convolutional feature descriptors with triplet losses` by
- V. Balntas, E. Riba et al. Default: ``False``.
- size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
- the losses are averaged over each loss element in the batch. Note that for
- some losses, there are multiple elements per sample. If the field :attr:`size_average`
- is set to ``False``, the losses are instead summed for each minibatch. Ignored
- when :attr:`reduce` is ``False``. Default: ``True``
- reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
- losses are averaged or summed over observations for each minibatch depending
- on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
- batch element instead and ignores :attr:`size_average`. Default: ``True``
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
- and :attr:`reduce` are in the process of being deprecated, and in the meantime,
- specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
- Shape:
- - Input: :math:`(N, D)` or :math:`(D)` where :math:`D` is the vector dimension.
- - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and
- input shape is :math:`(N, D)`; a scalar otherwise.
- Examples::
- >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
- >>> anchor = torch.randn(100, 128, requires_grad=True)
- >>> positive = torch.randn(100, 128, requires_grad=True)
- >>> negative = torch.randn(100, 128, requires_grad=True)
- >>> output = triplet_loss(anchor, positive, negative)
- >>> output.backward()
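- >>>
- >>> # With the distance swap enabled (illustrative addition, not part of the
- >>> # original docstring); reuses anchor, positive and negative from above
- >>> swap_loss = nn.TripletMarginLoss(margin=1.0, p=2, swap=True)
- >>> output = swap_loss(anchor, positive, negative)
- >>> output.backward()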
- .. _Learning shallow convolutional feature descriptors with triplet losses:
- http://www.bmva.org/bmvc/2016/papers/paper119/index.html
- """
- __constants__ = ['margin', 'p', 'eps', 'swap', 'reduction']
- margin: float
- p: float
- eps: float
- swap: bool
- def __init__(self, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None,
- reduce=None, reduction: str = 'mean'):
- super().__init__(size_average, reduce, reduction)
- self.margin = margin
- self.p = p
- self.eps = eps
- self.swap = swap
- def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor:
- return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
- eps=self.eps, swap=self.swap, reduction=self.reduction)
- class TripletMarginWithDistanceLoss(_Loss):
- r"""Creates a criterion that measures the triplet loss given input
- tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor,
- positive, and negative examples, respectively), and a nonnegative,
- real-valued function ("distance function") used to compute the relationship
- between the anchor and positive example ("positive distance") and the
- anchor and negative example ("negative distance").
- The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``)
- can be described as:
- .. math::
- \ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad
- l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}
- where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function
- quantifying the closeness of two tensors, referred to as the :attr:`distance_function`;
- and :math:`margin` is a nonnegative margin representing the minimum difference
- between the positive and negative distances that is required for the loss to
- be 0. The input tensors have :math:`N` elements each and can be of any shape
- that the distance function can handle.
- If :attr:`reduction` is not ``'none'``
- (default ``'mean'``), then:
- .. math::
- \ell(x, y) =
- \begin{cases}
- \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\
- \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.}
- \end{cases}
- See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet
- loss for input tensors using the :math:`l_p` distance as the distance function.
- Args:
- distance_function (Callable, optional): A nonnegative, real-valued function that
- quantifies the closeness of two tensors. If not specified,
- `nn.PairwiseDistance` will be used. Default: ``None``
- margin (float, optional): A nonnegative margin representing the minimum difference
- between the positive and negative distances required for the loss to be 0. Larger
- margins penalize cases where the negative examples are not distant enough from the
- anchors, relative to the positives. Default: :math:`1`.
- swap (bool, optional): Whether to use the distance swap described in the paper
- `Learning shallow convolutional feature descriptors with triplet losses` by
- V. Balntas, E. Riba et al. If True, and if the positive example is closer to the
- negative example than the anchor is, swaps the positive example and the anchor in
- the loss computation. Default: ``False``.
- reduction (str, optional): Specifies the (optional) reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the sum of the output will be divided by the number of
- elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'``
- Shape:
- - Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions
- as supported by the distance function.
- - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar
- otherwise.
- Examples::
- >>> # Initialize embeddings
- >>> embedding = nn.Embedding(1000, 128)
- >>> anchor_ids = torch.randint(0, 1000, (1,))
- >>> positive_ids = torch.randint(0, 1000, (1,))
- >>> negative_ids = torch.randint(0, 1000, (1,))
- >>> anchor = embedding(anchor_ids)
- >>> positive = embedding(positive_ids)
- >>> negative = embedding(negative_ids)
- >>>
- >>> # Built-in Distance Function
- >>> triplet_loss = \
- >>> nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance())
- >>> output = triplet_loss(anchor, positive, negative)
- >>> output.backward()
- >>>
- >>> # Custom Distance Function
- >>> def l_infinity(x1, x2):
- >>> return torch.max(torch.abs(x1 - x2), dim=1).values
- >>>
- >>> # xdoctest: +SKIP("FIXME: Would call backwards a second time")
- >>> triplet_loss = (
- >>> nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5))
- >>> output = triplet_loss(anchor, positive, negative)
- >>> output.backward()
- >>>
- >>> # Custom Distance Function (Lambda)
- >>> triplet_loss = (
- >>> nn.TripletMarginWithDistanceLoss(
- >>> distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y)))
- >>> output = triplet_loss(anchor, positive, negative)
- >>> output.backward()
- Reference:
- V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses:
- http://www.bmva.org/bmvc/2016/papers/paper119/index.html
- """
- __constants__ = ['margin', 'swap', 'reduction']
- margin: float
- swap: bool
- def __init__(self, *, distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None,
- margin: float = 1.0, swap: bool = False, reduction: str = 'mean'):
- super().__init__(size_average=None, reduce=None, reduction=reduction)
- self.distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = \
- distance_function if distance_function is not None else PairwiseDistance()
- self.margin = margin
- self.swap = swap
- def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor:
- return F.triplet_margin_with_distance_loss(anchor, positive, negative,
- distance_function=self.distance_function,
- margin=self.margin, swap=self.swap, reduction=self.reduction)
- class CTCLoss(_Loss):
- r"""The Connectionist Temporal Classification loss.
- Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the
- probability of possible alignments of input to target, producing a loss value which is differentiable
- with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which
- limits the length of the target sequence such that it must be :math:`\leq` the input length.
- Args:
- blank (int, optional): blank label. Default :math:`0`.
- reduction (str, optional): Specifies the reduction to apply to the output:
- ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
- ``'mean'``: the output losses will be divided by the target lengths and
- then the mean over the batch is taken, ``'sum'``: the output losses will be summed.
- Default: ``'mean'``
- zero_infinity (bool, optional):
- Whether to zero infinite losses and the associated gradients.
- Default: ``False``
- Infinite losses mainly occur when the inputs are too short
- to be aligned to the targets.
- Shape:
- - Log_probs: Tensor of size :math:`(T, N, C)` or :math:`(T, C)`,
- where :math:`T = \text{input length}`,
- :math:`N = \text{batch size}`, and
- :math:`C = \text{number of classes (including blank)}`.
- The logarithmized probabilities of the outputs (e.g. obtained with
- :func:`torch.nn.functional.log_softmax`).
- - Targets: Tensor of size :math:`(N, S)` or
- :math:`(\operatorname{sum}(\text{target\_lengths}))`,
- where :math:`N = \text{batch size}` and
- :math:`S = \text{max target length, if shape is } (N, S)`.
- It represents the target sequences. Each element in the target
- sequence is a class index, and the target index cannot be blank (default=0).
- In the :math:`(N, S)` form, targets are padded to the
- length of the longest sequence, and stacked.
- In the :math:`(\operatorname{sum}(\text{target\_lengths}))` form,
- the targets are assumed to be un-padded and
- concatenated within 1 dimension.
- - Input_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`,
- where :math:`N = \text{batch size}`. It represents the lengths of the
- inputs (each must be :math:`\leq T`). Lengths are specified
- for each sequence to achieve masking under the assumption that sequences
- are padded to equal lengths.
- - Target_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`,
- where :math:`N = \text{batch size}`. It represents the lengths of the targets.
- Lengths are specified for each sequence to achieve masking under the
- assumption that sequences are padded to equal lengths. If target shape is
- :math:`(N,S)`, target_lengths are effectively the stop index
- :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for
- each target in a batch. Lengths must each be :math:`\leq S`.
- If the targets are given as a 1d tensor that is the concatenation of individual
- targets, the target_lengths must add up to the total length of the tensor.
- - Output: scalar. If :attr:`reduction` is ``'none'``, then
- :math:`(N)` if input is batched or :math:`()` if input is unbatched, where :math:`N = \text{batch size}`.
- Examples::
- >>> # Target are to be padded
- >>> T = 50 # Input sequence length
- >>> C = 20 # Number of classes (including blank)
- >>> N = 16 # Batch size
- >>> S = 30 # Target sequence length of longest target in batch (padding length)
- >>> S_min = 10 # Minimum target length, for demonstration purposes
- >>>
- >>> # Initialize random batch of input vectors, for *size = (T,N,C)
- >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
- >>>
- >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
- >>> target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
- >>>
- >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
- >>> target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
- >>> ctc_loss = nn.CTCLoss()
- >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
- >>> loss.backward()
- >>>
- >>>
- >>> # Target are to be un-padded
- >>> T = 50 # Input sequence length
- >>> C = 20 # Number of classes (including blank)
- >>> N = 16 # Batch size
- >>>
- >>> # Initialize random batch of input vectors, for *size = (T,N,C)
- >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
- >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
- >>>
- >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
- >>> target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
- >>> target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.long)
- >>> ctc_loss = nn.CTCLoss()
- >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
- >>> loss.backward()
- >>>
- >>>
- >>> # Target are to be un-padded and unbatched (effectively N=1)
- >>> T = 50 # Input sequence length
- >>> C = 20 # Number of classes (including blank)
- >>>
- >>> # Initialize random batch of input vectors, for *size = (T,C)
- >>> # xdoctest: +SKIP("FIXME: error in doctest")
- >>> input = torch.randn(T, C).log_softmax(-1).detach().requires_grad_()
- >>> input_lengths = torch.tensor(T, dtype=torch.long)
- >>>
- >>> # Initialize random batch of targets (0 = blank, 1:C = classes)
- >>> target_lengths = torch.randint(low=1, high=T, size=(), dtype=torch.long)
- >>> target = torch.randint(low=1, high=C, size=(target_lengths,), dtype=torch.long)
- >>> ctc_loss = nn.CTCLoss()
- >>> loss = ctc_loss(input, target, input_lengths, target_lengths)
- >>> loss.backward()
- Reference:
- A. Graves et al.: Connectionist Temporal Classification:
- Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
- https://www.cs.toronto.edu/~graves/icml_2006.pdf
- Note:
- In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
- in concatenated format, all :attr:`input_lengths` must be `T`, :math:`blank=0`,
- :attr:`target_lengths` :math:`\leq 256`, and the integer arguments must be of
- dtype ``torch.int32``.
- The regular implementation uses the (more common in PyTorch) `torch.long` dtype.
- Note:
- In some circumstances when using the CUDA backend with CuDNN, this operator
- may select a nondeterministic algorithm to increase performance. If this is
- undesirable, you can try to make the operation deterministic (potentially at
- a performance cost) by setting ``torch.backends.cudnn.deterministic =
- True``.
- Please see the notes on :doc:`/notes/randomness` for background.
- """
- __constants__ = ['blank', 'reduction']
- blank: int
- zero_infinity: bool
- def __init__(self, blank: int = 0, reduction: str = 'mean', zero_infinity: bool = False):
- super().__init__(reduction=reduction)
- self.blank = blank
- self.zero_infinity = zero_infinity
- def forward(self, log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor) -> Tensor:
- return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction,
- self.zero_infinity)
- # TODO: L1HingeEmbeddingCriterion
- # TODO: MSECriterion weight
- # TODO: ClassSimplexCriterion
|