# See README.md in this directory for more guidance

# *********NB: _cast_* operators are DEPRECATED and will be removed
# eventually. They were used before TorchScript IR supported representing
# ScalarTypes. They are now superseded by `aten::to()`. The ops remain here
# for backward compatibility purposes.

# The eight `_cast_<Type>` entries below (Byte, Char, Double, Float, Int, Long,
# Short, Half) are declared identically apart from the name: each takes
# `(Tensor self, bool non_blocking=False)`, returns a `Tensor`, sets
# `use_c10_dispatcher: full`, and is exposed only as a function.

# DEPRECATED. DO NOT USE
- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# DEPRECATED. DO NOT USE
- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# The entries from `backward` through `retain_grad` are all method-only
# (`variants: method`) and all set `manual_kernel_registration: True`.
# `backward` and `requires_grad_` are the two that do not also set
# `use_c10_dispatcher: full`.

# Computes the gradient of current tensor w.r.t. graph leaves.
# `gradient` and `retain_graph` are optional and default to None;
# `create_graph` defaults to False. Declared with an empty return `()`.
- func: backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
  manual_kernel_registration: True
  variants: method

# DEPRECATED. Sets the tensor data held by this `Variable` to be the same as
# `new_data`.  It requires that `new_data` and `Variable` have compatible tensor
# type, by checking `_has_compatible_shallow_copy_type(this, new_data)`.
#
# This function is deprecated because it doesn't really make sense in a world
# where Variables *are* Tensors (as opposed to them containing tensors, which
# is what the previous interpretation was.)
- func: set_data(Tensor(a!) self, Tensor new_data) -> ()
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: method

# NOTE(review): presumably the read-side counterpart of `set_data` -- confirm.
- func: data(Tensor self) -> Tensor
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: method

# True if this `Variable` is a leaf and thus does not have a `grad_fn`.
- func: is_leaf(Tensor self) -> bool
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: method

# Returns the output index of this variable from the forward operation that
# produced it.  Conversely, it returns the input index of the gradient `Node` to
# which this `Variable` is connected (because in the gradient computation,
# inputs and outputs switch meaning).  For example:
#
#   y0, y1, y2 = f(x)
#   assert y0.output_nr == 0
#   assert y1.output_nr == 1
#   assert y2.output_nr == 2
#
- func: output_nr(Tensor self) -> int
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: method

# Returns an int for `self`.
# NOTE(review): presumably the tensor's version counter -- confirm.
- func: _version(Tensor self) -> int
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: method

# Takes and returns `self` under the same `(a!)` annotation; `requires_grad`
# defaults to True.
- func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!)
  manual_kernel_registration: True
  variants: method

# Enables .grad attribute for non-leaf Tensors.
- func: retain_grad(Tensor(a!) self) -> ()
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: method

# Entries dealing with dimension names (`Dimname`) and alignment.
# All are method-only except `align_tensors`, which has no `variants:` key.
# Only `align_as` and `align_tensors` set `use_c10_dispatcher: full`; they are
# also the only two here whose signature contains no `Dimname` argument.

# `names` is an optional `Dimname[]` with no default, so it must be passed
# explicitly (possibly as None).
- func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)
  variants: method

- func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)
  variants: method

- func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a)
  variants: method

# Overload of `align_to` taking the list as `order` plus an explicit
# `ellipsis_idx`.
- func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a)
  variants: method

- func: align_as(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method

- func: align_tensors(Tensor[] tensors) -> Tensor[]
  use_c10_dispatcher: full

- func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
  variants: method

# The two `unflatten` overloads differ only in the type of `dim`
# (`Dimname` vs. `int`).
- func: unflatten.Dimname(Tensor self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor
  variants: method

- func: unflatten.int(Tensor self, int dim, int[] sizes, Dimname[] names) -> Tensor
  variants: method

# cuDNN CTC-loss and RNN entries. Every `dispatch:` section in this group
# lists a single CUDA kernel whose name equals the operator name.
# `_use_cudnn_rnn_flatten_weight` is the only entry without a `dispatch:`
# section. `_cudnn_rnn`, `_cudnn_rnn_backward` and `_cudnn_init_dropout_state`
# do not set `use_c10_dispatcher: full`.

# Returns a bool for the same leading arguments that `_cudnn_ctc_loss` takes.
- func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
  use_c10_dispatcher: full
  dispatch:
    CUDA: _use_cudnn_ctc_loss

# Returns a pair of tensors.
- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: _cudnn_ctc_loss

# Takes no arguments and returns a bool.
- func: _use_cudnn_rnn_flatten_weight() -> bool
  use_c10_dispatcher: full

- func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: _cudnn_rnn_flatten_weight

# Returns five tensors. `weight_buf`, `cx` and `dropout_state` are optional.
- func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _cudnn_rnn

# Unlike `_cudnn_rnn`, `weight_buf` is a non-optional `Tensor` here. Returns
# three tensors followed by a tensor list; `output_mask` has four flags.
- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
  dispatch:
    CUDA: _cudnn_rnn_backward

# `dtype`, `layout` and `device` are keyword-only and have no defaults;
# `pin_memory` defaults to False.
- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor
  dispatch:
    CUDA: _cudnn_init_dropout_state

# Function-only entry returning an int for `self`.
# NOTE(review): the meaning of the returned int is not visible in this file.
- func: _debug_has_internal_overlap(Tensor self) -> int
  use_c10_dispatcher: full
  variants: function

# Both entries below are function-only and register a CUDA kernel only.
# The `dispatch:` children use the same 4-space indentation as every other
# entry in this file.

# Returns a pair of tensors; `generator` is optional and defaults to None.
- func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)
  variants: function
  dispatch:
    CUDA: fused_dropout_cuda

- func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CUDA: masked_scale_cuda

# Sobol engine helpers. None of the four entries specifies `variants:` or
# `dispatch:`. `_sobol_engine_draw` returns a pair of tensors and takes an
# optional `dtype` with no default; the three trailing-underscore entries take
# `self` as `Tensor(a!)` and return `Tensor(a!)`.
- func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor)

- func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)

- func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!)

- func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!)

# Two helpers in which a shape is carried as a `Tensor` argument or return
# value. Neither specifies `variants:` or `dispatch:`.
- func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor
  use_c10_dispatcher: full

- func: _shape_as_tensor(Tensor self) -> Tensor
  use_c10_dispatcher: full

# Four dropout flavours (`dropout`, `feature_dropout`, `alpha_dropout`,
# `feature_alpha_dropout`), each declared as a pair:
#   X(Tensor input, float p, bool train) -> Tensor
#   X_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
# Only the non-underscore entries set `use_c10_dispatcher: full`. None of the
# entries specifies `variants:` or `dispatch:`, and `p`/`train` have no
# defaults.
- func: dropout(Tensor input, float p, bool train) -> Tensor
  use_c10_dispatcher: full

- func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)

- func: feature_dropout(Tensor input, float p, bool train) -> Tensor
  use_c10_dispatcher: full

- func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)

- func: alpha_dropout(Tensor input, float p, bool train) -> Tensor
  use_c10_dispatcher: full

- func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)

- func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor
  use_c10_dispatcher: full

- func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)

# `abs` and `absolute` are declared with parallel signatures (`X`, `X_`,
# `X.out`). The `abs*` entries list no `dispatch:` section; the `absolute*`
# entries dispatch on CPU and CUDA to kernels named `abs`, `abs_` and
# `abs_out` respectively.
- func: abs(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: abs_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: absolute(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: abs
    CUDA: abs

- func: absolute_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method
  dispatch:
    CPU: abs_
    CUDA: abs_

- func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: abs_out
    CUDA: abs_out

# `angle` and `conj` are function+method and each has an `.out` overload.
# `view_as_real`, `view_as_complex`, `real` and `imag` are function-only and
# annotate both input and output with the same alias `(a)`.
- func: angle(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: view_as_real(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function

- func: view_as_complex(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function

- func: real(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function

- func: imag(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function

- func: conj(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# `acos` is declared as the trio `acos`, `acos_`, `acos.out`. The first two
# are function+method; no entry lists a `dispatch:` section.
- func: acos(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: acos_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# 1-d pooling entries; size-like parameters are typed `int[1]`.

# `stride` defaults to an empty list.
# NOTE(review): presumably an empty stride means "use kernel_size" -- confirm
# against the implementation.
- func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor
  use_c10_dispatcher: full

- func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor
  use_c10_dispatcher: full

# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
  use_c10_dispatcher: full

# `add` overloads. In the `.Tensor` and `.out` overloads `alpha` is
# keyword-only (it follows `*`); in the `.Scalar` overloads it is positional.
# `alpha` defaults to 1 everywhere.

# Lists kernels for CPU, CUDA, SparseCPU, SparseCUDA, MkldnnCPU and Vulkan.
# The two sparse backends share the kernel name `add_sparse`.
- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: add
    CUDA: add
    SparseCPU: add_sparse
    SparseCUDA: add_sparse
    MkldnnCPU: mkldnn_add
    Vulkan: vulkan_add

# Method-only. Same backends as `add.Tensor` except that no Vulkan kernel is
# listed.
- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: add_
    CUDA: add_
    SparseCPU: add_sparse_
    SparseCUDA: add_sparse_
    MkldnnCPU: mkldnn_add_

# Unlike the two entries above, SparseCPU and SparseCUDA map to distinct
# kernel names here.
- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: add_out
    CUDA: add_out
    SparseCPU: add_out_sparse_cpu
    SparseCUDA: add_out_sparse_cuda
    MkldnnCPU: mkldnn_add_out

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
  variants: method

# `addmv` family. `beta` and `alpha` are keyword-only Scalars defaulting to 1
# in every entry. `_addmv_impl_` is the only entry with a `dispatch:` section
# (CPU and CUDA) and takes an extra `self2` tensor after the mutable `self`.
- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  variants: function, method

- func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)

- func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  dispatch:
    CPU: addmv_impl_cpu
    CUDA: addmv_impl_cuda

# `addr` trio. `beta` and `alpha` are keyword-only Scalars defaulting to 1.
# `addr_` is method-only; no entry lists a `dispatch:` section.
- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  variants: method

- func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)

# Forward/backward pair, both function-only. They share the `size` and
# `align_corners` parameters; the first argument is `theta` in the forward
# entry and `grad` in the backward entry.
- func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor
  use_c10_dispatcher: full
  variants: function

# `all` and `any` have parallel overload sets: `.dim`, `.out`, `.dimname`,
# `.dimname_out`. `keepdim` defaults to False in each. Only the `.dim`
# overloads set `use_c10_dispatcher: full`.
- func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

# Returns a plain bool rather than a Tensor. Default tolerances are
# rtol=1e-05 and atol=1e-08; `equal_nan` defaults to False.
- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
  use_c10_dispatcher: full
  variants: function, method

- func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

# `arange` overloads. The three factory overloads take keyword-only, optional
# `dtype`/`layout`/`device`/`pin_memory` (all default None) and no Tensor
# inputs. `arange.start_out` is the only overload with a `dispatch:` section;
# its `step` defaults to 1.
- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)

- func: arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: arange_cpu_out
    CUDA: arange_cuda_out

# This function is a temporary hack to allow tracing of arange-like constructs with dynamic
# bounds on arange.  Normal arange is not traceable because it does not take any tensor inputs;
# if the range you need is based on another tensor, calling this function directly will
# preserve tracing.  Get rid of this when arange can directly take tensors for bounds
# (so that it can be traced directly).
- func: _dim_arange(Tensor like, int dim) -> Tensor
  use_c10_dispatcher: full

# `argmax` and `argmin` share one signature: `dim` is an optional int
# defaulting to None and `keepdim` defaults to False. Each dispatches on CPU
# and CUDA to a kernel named after the operator.
- func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: argmax
    CUDA: argmax

- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: argmin
    CUDA: argmin

# `acosh`, `asinh` and `atanh`, each declared as the trio `X`, `X_`, `X.out`.
# All nine entries set `supports_named_tensor: True`; none lists a
# `dispatch:` section. `X` and `X_` are function+method.
- func: acosh(Tensor self) -> Tensor
  use_c10_dispatcher: full
  supports_named_tensor: True
  variants: function, method

- func: acosh_(Tensor(a!) self) -> Tensor(a!)
  supports_named_tensor: True
  variants: function, method

- func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  supports_named_tensor: True

- func: asinh(Tensor self) -> Tensor
  use_c10_dispatcher: full
  supports_named_tensor: True
  variants: function, method

- func: asinh_(Tensor(a!) self) -> Tensor(a!)
  supports_named_tensor: True
  variants: function, method

- func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  supports_named_tensor: True

- func: atanh(Tensor self) -> Tensor
  use_c10_dispatcher: full
  supports_named_tensor: True
  variants: function, method

- func: atanh_(Tensor(a!) self) -> Tensor(a!)
  supports_named_tensor: True
  variants: function, method

- func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  supports_named_tensor: True

# Both `as_strided` entries set `device_guard: False`; `storage_offset` is an
# optional int defaulting to None.

# Input and output share alias `(a)`. CPU/CUDA map to
# `as_strided_tensorimpl`; QuantizedCPU/QuantizedCUDA map to
# `as_strided_qtensorimpl`.
- func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: as_strided_tensorimpl
    CUDA: as_strided_tensorimpl
    QuantizedCPU: as_strided_qtensorimpl
    QuantizedCUDA: as_strided_qtensorimpl
  device_guard: False

# No `dispatch:` section is listed for the trailing-underscore entry.
- func: as_strided_(Tensor(a!) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a!)
  variants: function, method
  device_guard: False

# `asin` and `atan`, each declared as the trio `X`, `X_`, `X.out`. `X` and
# `X_` are function+method; no entry lists a `dispatch:` section.
- func: asin(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: asin_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: atan(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: atan_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# `baddbmm` family. `beta` and `alpha` are keyword-only Scalars defaulting to
# 1 in every entry. `baddbmm`, `baddbmm_` and `baddbmm.out` each dispatch to
# separate `*_cpu` / `*_cuda` kernels; `_baddbmm_mkl_` is function-only and
# lists no `dispatch:` section.
- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: baddbmm_cpu
    CUDA: baddbmm_cuda

- func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: baddbmm__cpu
    CUDA: baddbmm__cuda

- func: _baddbmm_mkl_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  variants: function

- func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: baddbmm_out_cpu
    CUDA: baddbmm_out_cuda

# Two factory overloads; the `.periodic` overload adds a required
# `bool periodic`. `dtype`/`layout`/`device`/`pin_memory` are keyword-only and
# default to None.
- func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# Batch-norm entries. None sets `use_c10_dispatcher: full`.

# `weight`, `bias`, `running_mean` and `running_var` are all optional tensors
# without defaults.
- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor

# Only a QuantizedCPU kernel is listed. Unlike `batch_norm`, `mean` and `var`
# are non-optional. Sets `requires_tensor: True`.
- func: quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor
  requires_tensor: True
  dispatch:
    QuantizedCPU: quantized_batch_norm

# Same arguments as `batch_norm`; returns four tensors plus an int.
# NOTE(review): presumably that int is what
# `_batch_norm_impl_index_backward` receives as `impl_index` -- confirm.
- func: _batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)

# Takes `impl_index` as its first argument; `output_mask` has three flags and
# three tensors are returned.
- func: _batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)

# `bernoulli` overloads. `generator` is a keyword-only optional argument
# defaulting to None in every entry.

# Sample bernoulli with values in `self` as probability.
- func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor
  variants: function, method

- func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
  variants: function

# Method-only; the probability is passed as a tensor `p`.
- func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# Method-only; the probability is a float `p` defaulting to 0.5.
- func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# This out-of-place version isn't used explicitly, but needed by jit.
# There is no default value on `p` here because it would introduce ambiguity
# with `bernoulli(Tensor self, *, Generator? generator=None)` declaration.
- func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor
  variants: function, method

# `bias` is an optional tensor with no default.
- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor

# `binary_cross_entropy` forward/backward entries. All are function-only and
# default `reduction` to the symbolic value `Mean`. The four
# `binary_cross_entropy[_backward][.out/.grad_input]` entries set
# `python_module: nn` and dispatch to separate CPU and CUDA kernels; the two
# `*_with_logits*` entries have neither key and add an optional `pos_weight`.
- func: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
  python_module: nn
  variants: function
  dispatch:
    CPU: binary_cross_entropy_cpu
    CUDA: binary_cross_entropy_cuda

- func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  variants: function
  dispatch:
    CPU: binary_cross_entropy_out_cpu
    CUDA: binary_cross_entropy_out_cuda

- func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
  python_module: nn
  variants: function
  dispatch:
    CPU: binary_cross_entropy_backward_cpu
    CUDA: binary_cross_entropy_backward_cuda

# The output argument of this overload is named `grad_input` rather than `out`.
- func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  variants: function
  dispatch:
    CPU: binary_cross_entropy_backward_out_cpu
    CUDA: binary_cross_entropy_backward_out_cuda

- func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
  variants: function

- func: binary_cross_entropy_with_logits_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
  variants: function

# `weights` is optional (default None) and `minlength` defaults to 0.
# Dispatches to `_bincount_cpu` / `_bincount_cuda`.
- func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor
  variants: function, method
  dispatch:
    CPU: _bincount_cpu
    CUDA: _bincount_cuda

# `bitwise_not`, `logical_not`, `logical_xor`, `logical_and` and `logical_or`
# all follow one pattern:
#   X      -- `use_c10_dispatcher: full`, function+method, no dispatch section
#   X_     -- method-only, no dispatch section
#   X.out  -- dispatches on CPU and CUDA to the kernel `X_out`
# `bitwise_not` and `logical_not` are unary; the other three take `other`.
- func: bitwise_not(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: bitwise_not_(Tensor(a!) self) -> Tensor(a!)
  variants: method

- func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: bitwise_not_out
    CUDA: bitwise_not_out

- func: logical_not(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: logical_not_(Tensor(a!) self) -> Tensor(a!)
  variants: method

- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: logical_not_out
    CUDA: logical_not_out

- func: logical_xor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: logical_xor_out
    CUDA: logical_xor_out

- func: logical_and(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: logical_and_out
    CUDA: logical_and_out

- func: logical_or(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: logical_or_out
    CUDA: logical_or_out

# Two factory overloads; the `.periodic` overload adds a required
# `bool periodic`. `dtype`/`layout`/`device`/`pin_memory` are keyword-only and
# default to None.
- func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# `bmm` and `_bmm`. The `bmm` entries list CPU, CUDA, SparseCPU and SparseCUDA
# kernels. The `_bmm` entries list only a SparseCUDA kernel and add a
# keyword-only `deterministic` flag defaulting to False.
- func: bmm(Tensor self, Tensor mat2) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: bmm_cpu
    CUDA: bmm_cuda
    SparseCPU: bmm_sparse_cpu
    SparseCUDA: bmm_sparse_cuda

- func: _bmm(Tensor self, Tensor mat2, *, bool deterministic=False) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    SparseCUDA: _bmm_sparse_cuda

- func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bmm_out_cpu
    CUDA: bmm_out_cuda
    SparseCPU: bmm_out_sparse_cpu
    SparseCUDA: bmm_out_sparse_cuda

- func: _bmm.out(Tensor self, Tensor mat2, *, bool deterministic=False, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    SparseCUDA: _bmm_out_sparse_cuda

# Entries that take a list of tensors (`Tensor[] tensors`).

# Returns a tensor list; sets `device_guard: False`.
- func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
  use_c10_dispatcher: full
  device_guard: False

# `cat` has four overloads: by int `dim` (default 0) or by `Dimname dim` (no
# default), each with an out-variant. Only the plain overload sets
# `use_c10_dispatcher: full`.
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
  use_c10_dispatcher: full

- func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)

- func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor

- func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)

- func: block_diag(Tensor[] tensors) -> Tensor
  use_c10_dispatcher: full
  variants: function

# `ceil` trio: `ceil` and `ceil_` are function+method; only `ceil.out` lists
# kernels (CPU and CUDA, both `ceil_out`).
- func: ceil(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: ceil_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: ceil_out
    CUDA: ceil_out

# Function-only; takes a list of tensors and returns a single tensor.
- func: chain_matmul(Tensor[] matrices) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Returns a list whose elements carry the same alias `(a)` as `self`; `dim`
# defaults to 0. Sets `device_guard: False`.
- func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[]
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# `clamp`, `clamp_max` and `clamp_min`, each declared as `X`, `X_`, `X.out`
# with `X` and `X_` function+method. In `clamp*` both bounds are optional
# Scalars defaulting to None; in `clamp_max*` / `clamp_min*` the single bound
# is required.

# The only entry in this group with a `dispatch:` section: CPU and CUDA map to
# `clamp`, QuantizedCPU to `quantized_clamp`, Vulkan to `vulkan_clamp`.
- func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: clamp
    CUDA: clamp
    QuantizedCPU: quantized_clamp
    Vulkan: vulkan_clamp

- func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
  variants: function, method

- func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)

- func: clamp_max(Tensor self, Scalar max) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
  variants: function, method

- func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)

- func: clamp_min(Tensor self, Scalar min) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
  variants: function, method

- func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)

# Returns a bool for `self`; sets `device_guard: False`.
- func: cudnn_is_acceptable(Tensor self) -> bool
  use_c10_dispatcher: full
  device_guard: False

# Function-only; `value` is a positional Scalar defaulting to 0.
- func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Method-only; `memory_format` is keyword-only and defaults to
# `contiguous_format`.
- func: contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor
  variants: method

# Convolution entries. None lists `variants:` or `dispatch:`. Only
# `convolution_backward_overrideable`, `conv_tbc` and `conv_tbc_backward` set
# `use_c10_dispatcher: full`.

# `convolution` and `convolution_overrideable` have identical signatures;
# `bias` is optional with no default.
- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor

- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor

# Returns three named tensors (`grad_input`, `grad_weight`, `grad_bias`);
# `output_mask` has three flags.
- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full

# Same leading arguments as `convolution`, plus `benchmark`, `deterministic`
# and `cudnn_enabled`.
- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor

# Same leading arguments as `convolution` but without `groups`.
- func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding) -> Tensor

# `ggI`, `ggW` and `ggb` are optional; returns three tensors selected by a
# three-flag `output_mask`.
- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool[3] output_mask) -> (Tensor, Tensor, Tensor)

# `conv1d`/`conv2d`/`conv3d` differ only in the fixed list length
# (`int[1]`/`int[2]`/`int[3]`). Defaults: bias=None, stride=1, padding=0,
# dilation=1, groups=1.
- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor

- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor

- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor

# `bias` is a required tensor here; `pad` defaults to 0.
- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
  use_c10_dispatcher: full

# `pad` has no default in the backward entry; returns three tensors.
- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full

# NB: we inherit the goofy argument order from PyTorch torch.nn.functional
# (`groups` comes before `dilation` in the three entries below). The 2d and 3d
# entries carry the overload name `.input`; the 1d entry has no overload name.
- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor

- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor

- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor

# Method-only; sets `manual_kernel_registration: True` and
# `device_guard: False`. `non_blocking` defaults to False.
- func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
  manual_kernel_registration: True
  variants: method
  device_guard: False

# Declared with an explicitly empty dispatch mapping (`dispatch: {}`), i.e. no
# backend kernel is listed in this file.
- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
  use_c10_dispatcher: full
  dispatch: {}

# `cos` and `cosh`, each declared as the trio `X`, `X_`, `X.out`. `X` and `X_`
# are function+method; no entry lists a `dispatch:` section.
- func: cos(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: cos_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: cosh(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: cosh_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# `margin` defaults to 0.0 and `reduction` to the symbolic value `Mean`.
- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full

# cuDNN affine-grid and batch-norm entries; each lists a CUDA kernel only.

# The return value is named `grid`; the kernel name carries a `_forward`
# suffix that the operator name does not.
- func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_affine_grid_generator_forward

# TODO: Why do I have to call this grad?!
# The return value is named `grad_theta`.
- func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_affine_grid_generator_backward

# `weight` is required; `bias`, `running_mean` and `running_var` are optional.
# Returns four tensors.
- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: cudnn_batch_norm

# NB: You can only use this if you used cudnn_batch_norm training=True
# Takes a required `reserveSpace` tensor and returns three tensors.
- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: cudnn_batch_norm_backward

# cudnn convolution schemas. Every entry in this group lists only a CUDA kernel.
#
# The `.deprecated` overloads differ from their non-deprecated counterparts by
# an extra optional `bias` argument, and they are the only entries in this
# group without `use_c10_dispatcher: full`.
- func: cudnn_convolution.deprecated(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_deprecated

- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution

# Takes the size of `self` (`self_size`) instead of the tensor itself.
- func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_backward_input

# Returns two tensors; `output_mask` is a fixed-size list of two bools.
- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[2] output_mask) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_backward

# Takes the size of the weight (`weight_size`) instead of the tensor itself.
- func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_backward_weight

# The transpose variants add an `output_padding` argument after `padding`.
- func: cudnn_convolution_transpose.deprecated(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_transpose_deprecated

- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_transpose

# NB: output_padding not strictly needed here, but it's helpful for the double
# backwards
# Combined backward: returns two tensors, selected by the two-element
# `output_mask`. This is the only backward in this group whose schema carries
# `output_padding`. CUDA only.
- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[2] output_mask) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_transpose_backward

# Input-gradient schema: takes neither `self` nor a size list for it. CUDA only.
- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_transpose_backward_input

# Weight-gradient schema: takes `weight_size` rather than the weight tensor.
# CUDA only.
- func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_convolution_transpose_backward_weight

# NB: input is special cased in a way I don't quite understand
# Forward schema returns one tensor named `output`. Note that the CUDA kernel
# name (`cudnn_grid_sampler_forward`) differs from the operator name.
- func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_grid_sampler_forward

# Backward schema returns two named tensors: `grad_self` and `grad_grid`.
# CUDA only.
- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid)
  use_c10_dispatcher: full
  dispatch:
    CUDA: cudnn_grid_sampler_backward

# cummax overloads: `dim` is given either as an int or as a Dimname, and each
# form has an out-variant that writes into caller-supplied `values`/`indices`.
# Every overload returns the named pair (values, indices).
- func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

- func: cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

- func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
  variants: function, method

- func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

# Helper with an empty return `()`: `values` and `indices` are positional
# arguments carrying the mutable (a!)/(b!) annotations. Function-only, with
# explicit CPU and CUDA kernels.
- func: _cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
  variants: function
  dispatch:
    CPU: cummax_helper_cpu
    CUDA: cummax_helper_cuda

# cummin overloads: `dim` is given either as an int or as a Dimname, and each
# form has an out-variant that writes into caller-supplied `values`/`indices`.
# Every overload returns the named pair (values, indices).
- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

- func: cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

- func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
  variants: function, method

- func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

# Helper with an empty return `()`: `values` and `indices` are positional
# arguments carrying the mutable (a!)/(b!) annotations. Function-only, with
# explicit CPU and CUDA kernels.
- func: _cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
  variants: function
  dispatch:
    CPU: cummin_helper_cpu
    CUDA: cummin_helper_cuda

# cumprod overloads: int `dim` and Dimname `dim`, each with an out-variant.
# All four accept an optional keyword-only `dtype` (default None). No dispatch
# table is declared for any of them.
- func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

- func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

# cumsum overloads: int `dim` and Dimname `dim`, each with an out-variant.
# All four accept an optional keyword-only `dtype` (default None). No dispatch
# table is declared for any of them.
- func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

- func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

# Public ctc_loss overloads. `.IntList` takes the lengths as int lists and
# `.Tensor` takes them as tensors; both default to blank=0, reduction=Mean,
# zero_infinity=False.
- func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
  use_c10_dispatcher: full

# convenience function that converts to intlists for you
- func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
  use_c10_dispatcher: full

# Internal forward: has no `reduction` argument and returns two tensors.
# Explicit CPU and CUDA kernels are listed.
- func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CPU:  ctc_loss_cpu
    CUDA: ctc_loss_gpu

# Internal backward: takes `neg_log_likelihood` and `log_alpha` tensors in
# addition to the forward inputs. Unlike the forward, `blank` has no default.
- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: ctc_loss_backward_cpu
    CUDA: ctc_loss_backward_gpu

# det and the diag* family. None of these entries declares a dispatch table.
- func: det(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Defaults: offset=0, dim1=-2, dim2=-1 (these differ from `diagonal` below).
- func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: diagflat(Tensor self, int offset=0) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# `self` and the result share the alias annotation `(a)` without `!`.
# Defaults: offset=0, dim1=0, dim2=1.
- func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method

# Named-dimension overload: `outdim`, `dim1`, `dim2` and `offset` are all
# keyword-only.
- func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a)
  variants: function, method

# In-place (`self` is annotated `(a!)`) and method-only.
- func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)
  variants: method

# div with a Tensor `other`: functional, in-place (method-only) and out
# forms. Each lists kernels for CPU, CUDA, SparseCPU and SparseCUDA; the dense
# pair shares one kernel name and the sparse pair shares another.
- func: div.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: div
    CUDA: div
    SparseCPU: div_sparse
    SparseCUDA: div_sparse

- func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: div_
    CUDA: div_
    SparseCPU: div_sparse_
    SparseCUDA: div_sparse_

- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: div_out
    CUDA: div_out
    SparseCPU: div_out_sparse_zerodim
    SparseCUDA: div_out_sparse_zerodim

# For C++ only, until we have conversion from C++ numbers to Tensor
# The Scalar overloads declare no dispatch table and have no out-variant here.
- func: div.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

# dot: the functional form dispatches to kernels in the `legacy::cpu` and
# `legacy::cuda` namespaces (`_th_dot`).
- func: dot(Tensor self, Tensor tensor) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: legacy::cpu::_th_dot
    CUDA: legacy::cuda::_th_dot

# Out-variant; no dispatch table is declared for it.
- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)

# einsum: takes a string `equation` and a list of tensors. No `variants` key
# and no dispatch table are declared.
- func: einsum(str equation, Tensor[] tensors) -> Tensor
  use_c10_dispatcher: full

# embedding forward. Defaults: padding_idx=-1, scale_grad_by_freq=False,
# sparse=False.
- func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
  use_c10_dispatcher: full

# Backward entry carrying a `sparse` flag; no dispatch table is declared.
# NOTE(review): presumably routes to the dense or sparse backward below
# depending on `sparse` -- confirm against the implementation.
- func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
  use_c10_dispatcher: full

# Dense backward: explicit CPU and CUDA kernels.
- func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: embedding_dense_backward_cpu
    CUDA: embedding_dense_backward_cuda

# In-place on `self` (annotated `(a!)`); explicit CPU and CUDA kernels.
- func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
  dispatch:
    CPU: embedding_renorm_cpu_
    CUDA: embedding_renorm_cuda_

# Sparse backward: same arguments as the dense backward (first one named
# `grad`), but no dispatch table is declared.
- func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
  use_c10_dispatcher: full

# NOTE [ embedding_bag Native Functions ]
# The `_embedding_bag.*` variants assume that input tensors except for `weight`,
# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous.
# We really only need to enforce this for `_embedding_bag` (the forward) because
# the backward inputs are the same as forward ones.
# The `embedding_bag` wrapper below is created to achieve this, e.g.,
# applying indices = indices.contiguous().
# The backward functions apply a check that these input tensors are contiguous.

# Public wrapper: same signature as `_embedding_bag` below, but with no
# dispatch table. Returns four tensors.
- func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor)

# Forward implementation with explicit CPU and CUDA kernels.
- func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CPU: _embedding_bag_cpu
    CUDA: _embedding_bag_cuda

# Backward entry carrying a `sparse` flag; no dispatch table is declared.
- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, int num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights) -> Tensor

# Sparse backward: unlike the dense backward it takes no `maximum_indices`.
# No dispatch table is declared.
- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, int num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights) -> Tensor

# Dense backward: explicit CPU and CUDA kernels.
- func: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, int num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights) -> Tensor
  dispatch:
    CPU: _embedding_bag_dense_backward_cpu
    CUDA: _embedding_bag_dense_backward_cuda

# Backward with respect to `per_sample_weights` (per the operator name); takes
# `weight` and returns a single tensor. Explicit CPU and CUDA kernels.
- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: _embedding_bag_per_sample_weights_backward_cpu
    CUDA: _embedding_bag_per_sample_weights_backward_cuda

# Factory schemas. All take `size` plus optional keyword-only dtype / layout /
# device / pin_memory arguments (each defaulting to None).
- func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

# Named-tensor overload: `names` is an optional Dimname list with no default.
# The device guard is disabled for this entry.
- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  device_guard: False

# Per-backend kernels are listed for dense CPU/CUDA, MKLDNN, sparse CPU/CUDA
# (sharing `empty_sparse`) and Vulkan.
- func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  dispatch:
    CPU: empty_cpu
    CUDA: empty_cuda
    MkldnnCPU: empty_mkldnn
    SparseCPU: empty_sparse
    SparseCUDA: empty_sparse
    Vulkan: empty_vulkan

# The new_* entries are method-only and take `self` ahead of `size`; none of
# them has a `memory_format` argument.
- func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  variants: method

- func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  variants: method

- func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  variants: method

# other overrides are to provide a more helpful error message that dtype is required
# Defaults: scale=1, zero_point=0, memory_format=contiguous_format.
# The plain CPU key maps to a `..._other_backends_stub` kernel, while
# QuantizedCPU and QuantizedCUDA share `empty_affine_quantized`.
- func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
  dispatch:
    CPU: empty_affine_quantized_other_backends_stub
    QuantizedCPU: empty_affine_quantized
    QuantizedCUDA: empty_affine_quantized

# it's a factory function receiving a tensor argument, thus overriding explicitly
# other overrides are to provide a more helpful error message that dtype is required
# `scales`, `zero_points` and `axis` are required keyword-only arguments.
# Only CPU (stub) and QuantizedCPU kernels are listed; there is no
# QuantizedCUDA entry.
- func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
  category_override: factory
  dispatch:
    CPU: empty_per_channel_affine_quantized_other_backends_stub
    QuantizedCPU: empty_per_channel_affine_quantized_cpu

# resize_: in-place on `self` (annotated `(a!)`), method-only, with an
# optional keyword-only `memory_format`. It is flagged for manual kernel
# registration and has the device guard disabled.
- func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
  manual_kernel_registration: True
  variants: method
  device_guard: False

# Takes a `qtensor` argument instead of dtype/layout/device options.
# Function-only; QuantizedCPU and QuantizedCUDA share one kernel name.
- func: empty_quantized(int[] size, Tensor qtensor) -> Tensor
  variants: function
  dispatch:
    QuantizedCPU: empty_quantized
    QuantizedCUDA: empty_quantized

# Out-variant of empty; device guard disabled.
- func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
  device_guard: False

# Takes `self` instead of `size`; device guard disabled.
- func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  device_guard: False

# Takes explicit `stride` alongside `size` and has no `memory_format`
# argument. Kernels are listed for CPU, CUDA and Vulkan.
- func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
    CPU: empty_strided_cpu
    CUDA: empty_strided_cuda
    Vulkan: empty_strided_vulkan

# Unary ops erf / erfc / exp / expm1. Each comes as a triple: functional
# (`op`), in-place (`op_`, `self` annotated `(a!)`) and out-variant (`op.out`).
# The functional and in-place forms are exposed as both function and method.
- func: erf(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: erf_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: erfc(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: erfc_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: exp(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: exp_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: expm1(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: expm1_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

# The only out-variant in this group with an explicit dispatch table.
- func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: expm1_out
    CUDA: expm1_out

# expand: `self` and the result share the alias annotation `(a)`; `implicit`
# is keyword-only and defaults to False. Device guard disabled.
- func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
  device_guard: False

# expand_as: takes a second tensor `other` instead of `size`. Unlike
# `expand`, the schema carries no alias annotation. Device guard disabled.
- func: expand_as(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
  device_guard: False

# eye: the base overload takes `n`; the `.m` overload takes `n` and `m`.
# The factory forms declare no dispatch table.
- func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# Both out-variants map to the same kernel names (`eye_out_cpu` /
# `eye_out_cuda`).
- func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: eye_out_cpu
    CUDA: eye_out_cuda

- func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: eye_out_cpu
    CUDA: eye_out_cuda

# flatten overloads, distinguished by how the dimension range is given.
# `.using_ints` is the only one with defaults (start_dim=0, end_dim=-1) and
# the only one without an `out_dim` name.
- func: flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Int range plus a Dimname for the resulting dimension.
- func: flatten.named_out_dim(Tensor self, int start_dim, int end_dim, Dimname out_dim) -> Tensor
  variants: function, method

# Range endpoints given as Dimnames.
- func: flatten.using_names(Tensor self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor
  variants: function, method

# Dimensions given as a Dimname list.
- func: flatten.DimnameList(Tensor self, Dimname[] dims, Dimname out_dim) -> Tensor
  variants: function, method

# fill_: in-place on `self` (annotated `(a!)`), with the fill value given
# either as a Scalar or as a Tensor. Both are exposed as function and method.
- func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
  variants: function, method

- func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
  variants: function, method

# floor: functional, in-place and out forms. Only the out-variant lists
# kernels (CPU and CUDA share `floor_out`).
- func: floor(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: floor_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: floor_out
    CUDA: floor_out

# floor_divide with a Tensor `other`. Note that the functional form has no
# overload name, while the in-place form is `floor_divide_.Tensor`.
# Each of the three Tensor forms lists CPU, CUDA, SparseCPU and SparseCUDA
# kernels.
- func: floor_divide(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: floor_divide
    CUDA: floor_divide
    SparseCPU: floor_divide_sparse
    SparseCUDA: floor_divide_sparse

- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: floor_divide_
    CUDA: floor_divide_
    SparseCPU: floor_divide_sparse_
    SparseCUDA: floor_divide_sparse_

- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: floor_divide_out
    CUDA: floor_divide_out
    SparseCPU: floor_divide_out_sparse_zerodim
    SparseCUDA: floor_divide_out_sparse_zerodim

# Scalar overloads: no dispatch table is declared; the in-place form is
# method-only.
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

# frac: functional, in-place and out forms. None declares a dispatch table.
- func: frac(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: frac_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# full: factory schemas taking `size` and a Scalar `fill_value`.
# The `.names` overload adds an optional Dimname list (no default) and is the
# only entry in this group with the device guard disabled.
- func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)

# Takes `self` instead of `size`, and additionally accepts `memory_format`.
- func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

# from_file: factory taking a `filename` string. `shared` defaults to None
# and `size` defaults to 0 (both optional). Only a CPU kernel is listed.
- func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
    CPU: from_file

# NOTE [ grid_sampler Native Functions ]
# `grid_sampler` does all the shape checking and then dispatches to one of
# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which
# has the corresponding backward defined as native functions as well. Therefore,
# in these functions and their backwards, no more shape checking is done.
#
# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to
# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in
# `interpolation_mode` because it only supports Bilinear interpolation mode.
# Nor does it take in `align_corners` because it only supports the mode
# `align_corners = True`.
#
# Schema summary: the entry point `grid_sampler` declares no dispatch table;
# the 2d/3d forwards and backwards each list explicit CPU and CUDA kernels.
# The forwards share one argument list, and the backwards prepend
# `grad_output` to it and return two tensors.
- func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
  use_c10_dispatcher: full

- func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_2d_cpu
    CUDA: grid_sampler_2d_cuda

- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_2d_backward_cpu
    CUDA: grid_sampler_2d_backward_cuda

- func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_3d_cpu
    CUDA: grid_sampler_3d_cuda

- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_3d_backward_cpu
    CUDA: grid_sampler_3d_backward_cuda

# Window factory schemas. Each overload extends the previous one with one
# more positional argument (`periodic`, then `alpha`, then `beta`); the added
# arguments have no defaults. No dispatch table is declared for any of them.
- func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# hinge_embedding_loss: defaults are margin=1.0 and reduction=Mean. No
# `variants` key and no dispatch table are declared.
- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full

# ger: functional form (function and method) plus an out-variant. Neither
# declares a dispatch table.
- func: ger(Tensor self, Tensor vec2) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)

# Public group_norm: `weight` and `bias` are optional (default None);
# eps defaults to 1e-05 and cudnn_enabled to True. No dispatch table.
- func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor

# Native forward: takes explicit N, C, HxW and group ints and returns three
# tensors. CPU and CUDA share one kernel name.
- func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, int N, int C, int HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
  dispatch:
    CPU: native_group_norm
    CUDA: native_group_norm

# Native backward: takes `mean` and `rstd` tensors and a three-element
# `output_mask`; returns three tensors.
- func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CPU: native_group_norm_backward
    CUDA: native_group_norm_backward

# FFT
#
# The four public entries (fft, ifft, rfft, irfft) take `signal_ndim` and a
# `normalized` flag defaulting to False, and declare no dispatch table.

- func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: ifft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# rfft / irfft additionally take `onesided` (default True).
- func: rfft(Tensor self, int signal_ndim, bool normalized=False, bool onesided=True) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# irfft also accepts `signal_sizes`, defaulting to an empty list.
- func: irfft(Tensor self, int signal_ndim, bool normalized=False, bool onesided=True, int[] signal_sizes=[]) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Internal entry with no defaulted arguments. The CPU key maps to `_fft_mkl`
# and the CUDA key to `_fft_cufft`.
- func: _fft_with_size(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, bool normalized, bool onesided, int[] output_sizes) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _fft_mkl
    CUDA: _fft_cufft

# cuFFT plan-cache accessors. None takes a tensor; all are keyed by
# `device_index`. The getters return an int, and the setter and the clear
# entry return nothing `()`. No dispatch table is declared.
- func: _cufft_get_plan_cache_size(int device_index) -> int
  use_c10_dispatcher: full

- func: _cufft_get_plan_cache_max_size(int device_index) -> int
  use_c10_dispatcher: full

- func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> ()
  use_c10_dispatcher: full

- func: _cufft_clear_plan_cache(int device_index) -> ()
  use_c10_dispatcher: full

# index.Tensor: `indices` is a list whose elements are optional tensors
# (`Tensor?[]`).
- func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
  variants: function, method
  # NB: This function is special-cased in tools/autograd/gen_variable_type.py
  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
  # - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
  # - Tensor Tensor::index(std::initializer_list<TensorIndex> indices)

# index_copy: in-place forms are method-only; functional forms are exposed as
# function and method. `dim` is either an int or a Dimname.
- func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
  variants: method

- func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
  variants: method

- func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
  variants: function, method

# index_put_: in-place on `self`; `accumulate` defaults to False.
- func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)
  variants: function, method
  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
  # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Tensor const & rhs)
  # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Scalar v)
  # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Tensor const & rhs)
  # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Scalar v)

- func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
  variants: function, method

# Function-only variant of index_put_ with an extra `unsafe` flag
# (default False).
- func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
  variants: function

# instance_norm: weight, bias and the running statistics are optional tensors,
# and no argument has a default. Function-only; no dispatch table.
- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
  variants: function

# inverse: the public functional and out forms declare no dispatch table.
- func: inverse(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# Function-only helper with explicit CPU and CUDA kernels.
- func: _inverse_helper(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _inverse_helper_cpu
    CUDA: _inverse_helper_cuda

# isclose: defaults are rtol=1e-05, atol=1e-08, equal_nan=False. Returns a
# Tensor; no dispatch table.
- func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# isnan: returns a Tensor. Device guard disabled; dense and sparse backends
# map to separate kernels (`isnan` / `isnan_sparse`).
- func: isnan(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False
  dispatch:
    CPU: isnan
    CUDA: isnan
    SparseCPU: isnan_sparse
    SparseCUDA: isnan_sparse

# Tensor predicates. Every entry in this group returns a plain `bool` rather
# than a Tensor, is exposed as both function and method, has the device guard
# disabled, and declares no dispatch table.
- func: is_distributed(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: is_floating_point(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: is_complex(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: is_nonzero(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# The only predicate here taking a second tensor.
- func: is_same_size(Tensor self, Tensor other) -> bool
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: is_signed(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# kl_div: `reduction` defaults to Mean; `log_target` is keyword-only and
# defaults to False. The forward declares no dispatch table.
- func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor
  use_c10_dispatcher: full

# Backward: same arguments with `grad_output` prepended; explicit CPU and
# CUDA kernels.
- func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: kl_div_backward_cpu
    CUDA: kl_div_backward_cuda

# kthvalue: returns the named pair (values, indices). The int-dim overloads
# default `dim` to -1; `keepdim` defaults to False everywhere.
- func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

# Out-variant (overload name `.values`): the only entry in this group with an
# explicit dispatch table.
- func: kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
  dispatch:
    CPU: kthvalue_out_cpu
    CUDA: kthvalue_out_cuda

# Dimname overloads: `dim` is required (no default).
- func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  variants: function, method

- func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

# Public layer_norm: `weight` and `bias` are optional (default None);
# eps defaults to 1e-05 and cudnn_enable to True. No dispatch table.
- func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor

# Native forward: takes explicit M and N ints instead of `normalized_shape`
# and returns three tensors.
- func: native_layer_norm(Tensor input, Tensor? weight, Tensor? bias, int M, int N, float eps) -> (Tensor, Tensor, Tensor)
  dispatch:
    CPU: layer_norm_cpu
    CUDA: layer_norm_cuda

# Native backward: takes `mean` and `rstd` tensors and a three-element
# `output_mask`; returns three tensors.
- func: native_layer_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, int M, int N, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CPU: layer_norm_backward_cpu
    CUDA: layer_norm_backward_cuda

# linear: `bias` is optional (default None). Both entries set
# `python_module: nn`.
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  python_module: nn

# Same signature; the only kernel listed is for the MkldnnCPU key.
- func: mkldnn_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  python_module: nn
  dispatch:
    MkldnnCPU: mkldnn_linear

# fbgemm_* schemas. None declares `variants` or a dispatch table, and no
# argument in this group has a default value.
#
# The two int8 linear entries have identical argument lists; note that `bias`
# is a required tensor here.
- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
  use_c10_dispatcher: full

- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
  use_c10_dispatcher: full

# Returns a mixed tuple: two tensors, a float and an int.
- func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int)
  use_c10_dispatcher: full

- func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
  use_c10_dispatcher: full

# The two fp16 linear entries have identical argument lists.
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
  use_c10_dispatcher: full

- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
  use_c10_dispatcher: full

- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
  use_c10_dispatcher: full

# `.KN` overload: additionally takes explicit K and N ints.
- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor
  use_c10_dispatcher: full

# linspace: `steps` defaults to 100. The factory form declares no dispatch
# table.
- func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# Out-variant with separate CPU and CUDA kernels.
- func: linspace.out(Scalar start, Scalar end, int steps=100, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: linspace_cpu_out
    CUDA: linspace_cuda_out

# log / log10 / log1p / log2. Each comes as a triple: functional (`op`),
# in-place (`op_`, `self` annotated `(a!)`) and out-variant (`op.out`).
# Every out-variant lists CPU and CUDA kernels sharing one name.
- func: log(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: log_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: log_out
    CUDA: log_out

- func: log10(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: log10_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: log10_out
    CUDA: log10_out

# log1p is the only op in this group with sparse kernels; they are listed on
# the in-place and out forms, not on the functional form.
- func: log1p(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: log1p_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method
  dispatch:
    CPU: log1p_
    CUDA: log1p_
    SparseCPU: log1p_sparse_
    SparseCUDA: log1p_sparse_

- func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: log1p_out
    CUDA: log1p_out
    SparseCPU: log1p_out_sparse
    SparseCUDA: log1p_out_sparse

- func: log2(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: log2_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: log2_out
    CUDA: log2_out

# logaddexp / logaddexp2: binary ops on `self` and `other`. In this group the
# out-variant is declared before the functional form, and no in-place form or
# dispatch table is declared.
- func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)

- func: logaddexp(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)

- func: logaddexp2(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# logdet: unary, exposed as function and method; no out-variant is declared
# here and there is no dispatch table.
- func: logdet(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# logspace: `steps` defaults to 100 and `base` to 10.0. The factory form
# declares no dispatch table.
- func: logspace(Scalar start, Scalar end, int steps=100, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# Out-variant with separate CPU and CUDA kernels.
- func: logspace.out(Scalar start, Scalar end, int steps=100, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: logspace_cpu_out
    CUDA: logspace_cuda_out

# log_softmax allows a positional `dtype`, unlike most operators, because making
# it keyword-only would be BC-breaking when loading jit models.
- func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
  variants: function, method

# Dimname overload; here `dtype` IS keyword-only (note the `*`).
- func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

# Underscore-prefixed forward entry with per-backend kernels. It takes a
# `half_to_float` flag rather than a `dtype`.
- func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: log_softmax_cpu
    CUDA: log_softmax_cuda

# Backward entry for `_log_softmax`; takes the forward `output` and `self`
# in addition to `grad_output`.
- func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: log_softmax_backward_cpu
    CUDA: log_softmax_backward_cuda

# Underscore-prefixed `_logcumsumexp` entries carry the per-backend kernels
# (functional and `out=`).
- func: _logcumsumexp(Tensor self, int dim) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: _logcumsumexp_cpu
    CUDA: _logcumsumexp_cuda

- func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _logcumsumexp_out_cpu
    CUDA: _logcumsumexp_out_cuda

# Public `logcumsumexp` overloads (int dim and Dimname dim, each with an `out=`
# form). None of them has a `dispatch:` section.
- func: logcumsumexp(Tensor self, int dim) -> Tensor
  variants: function, method

- func: logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)

- func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor
  variants: function, method

- func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)

# `logsumexp`: reduction over `dim` (declared `int[1]`, or `Dimname[1]` for the
# named overloads) with `keepdim` defaulting to False. Each dim flavour has a
# functional and an `out=` overload; no `dispatch:` sections.
- func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

# `margin_ranking_loss`: `margin` defaults to 0.0; `reduction` is an int whose
# default is written symbolically as `Mean`. No `variants:` key is given.
- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full

# `matmul`: functional overload (function and method) plus an `out=` overload.
# Neither lists backend kernels.
- func: matmul(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)

# `matrix_rank`: two overloads, with and without an explicit `tol`.
# `symmetric` defaults to False in both. No `variants:` key is given.
- func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor
  use_c10_dispatcher: full

- func: matrix_rank(Tensor self, bool symmetric=False) -> Tensor
  use_c10_dispatcher: full

# `matrix_power`: integer exponent `n`; exposed as function and method.
- func: matrix_power(Tensor self, int n) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# `max` along a dimension: returns a named pair `(values, indices)`.
- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

# Out overload with two keyword-only outputs.
# NOTE(review): the second output argument is named `max_values` although it is
# returned as `indices` (alias set `b`); `min.dim_min` names the corresponding
# argument `min_indices`. Argument names are part of the schema, so confirm
# caller impact before renaming.
- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)

# `max_values`: returns a single tensor; `dim` is declared `int[1]`.
- func: max_values(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Dimname counterparts of the three entries above.
- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  variants: function, method

- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)

- func: max_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
  variants: function, method

# Max-pooling entries. All share the same argument list: `kernel_size`,
# `stride` (default `[]`), `padding` (default 0), `dilation` (default 1) and
# `ceil_mode` (default False); only the fixed list length (1, 2 or 3) differs.

# Return: (Tensor output, Tensor indices)
- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
  use_c10_dispatcher: full

- func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
  use_c10_dispatcher: full

# Backend-specific 2d variant: only a MkldnnCPU kernel is listed, and
# `requires_tensor: True` is set.
- func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
  use_c10_dispatcher: full
  requires_tensor: True
  dispatch:
    MkldnnCPU: mkldnn_max_pool2d

# Backend-specific 2d variant: only a QuantizedCPU kernel is listed, and
# `requires_tensor: True` is set.
- func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
  use_c10_dispatcher: full
  requires_tensor: True
  dispatch:
    QuantizedCPU: quantized_max_pool2d

- func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
  use_c10_dispatcher: full

# The CPU and GPU dispatch variants are named weirdly here (`mean_cpu_gpu`,
# shared by both backends) because otherwise there are namespacing issues in C++.
- func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
  variants: function, method
  dispatch:
    CPU: mean_cpu_gpu
    CUDA: mean_cpu_gpu
    QuantizedCPU: quantized_mean_cpu

# Reduction over explicit dims. This is the only `mean` overload that also
# lists a Vulkan kernel.
- func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  variants: function, method
  dispatch:
    CPU: mean_cpu_gpu
    CUDA: mean_cpu_gpu
    QuantizedCPU: quantized_mean_cpu
    Vulkan: mean_vulkan

# `out=` form of `mean.dim`; `dtype` and `out` are both keyword-only.
- func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: mean_out_cpu_gpu
    CUDA: mean_out_cpu_gpu
    QuantizedCPU: quantized_mean_out_cpu

# Dimname overloads; no `dispatch:` section.
- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

# `median` along a dimension: returns a named pair `(values, indices)`.
# int-dim and Dimname-dim flavours, each with an out overload whose
# keyword-only outputs are named `values` and `indices`.
- func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

- func: median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

- func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  variants: function, method

- func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

# `min` along a dimension: returns a named pair `(values, indices)`.
- func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

# Out overload; the keyword-only outputs are named `min` and `min_indices`.
- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)

# `min_values`: returns a single tensor; `dim` is declared `int[1]`.
- func: min_values(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Dimname counterparts of the three entries above.
- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  variants: function, method

- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)

- func: min_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
  variants: function, method

# MKL-DNN convolution forward; `bias` is optional (`Tensor?`). None of the
# four entries in this group has a `dispatch:` section.
- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor

# Backward w.r.t. input; takes the input's size rather than the input tensor.
- func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor
  use_c10_dispatcher: full

# Backward w.r.t. weights; returns two tensors.
# NOTE(review): presumably (grad_weight, grad_bias) given `bias_defined` —
# confirm against the kernel.
- func: mkldnn_convolution_backward_weights(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> (Tensor, Tensor)
  use_c10_dispatcher: full

# Combined backward; `output_mask` has three entries and three tensors are returned.
- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full

# MIOpen batch norm forward/backward; kernels are listed under CUDA only.
# `weight` is a required `Tensor` here, whereas `bias` and the running/saved
# statistics are optional (`Tensor?`).
- func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_batch_norm

- func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_batch_norm_backward

# MIOpen convolution entries. Every entry lists a single CUDA kernel with the
# same name as the operator. All entries except `_backward_bias` take
# `benchmark` and `deterministic` flags.
- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution

# Backward w.r.t. input; takes the input's size rather than the input tensor.
- func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_backward_input

# Combined backward; `output_mask` has three entries and three tensors are returned.
- func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_backward

# Bias gradient; depends on `grad_output` only.
- func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_backward_bias

# Backward w.r.t. weight; takes the weight's size rather than the weight tensor.
- func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_backward_weight

# Transposed convolution forward; adds `output_padding` to the argument list.
- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_convolution_transpose

# NB: output_padding not strictly needed here, but it's helpful for the float
# backwards
# (Only the combined backward below takes `output_padding`; the `_input` and
# `_weight` entries do not.)
- func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_transpose_backward

# Backward w.r.t. input. Unlike `miopen_convolution_backward_input`, no input
# size argument is taken.
- func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_transpose_backward_input

# Backward w.r.t. weight; takes the weight's size rather than the weight tensor.
- func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_convolution_transpose_backward_weight

# MIOpen depthwise convolution: forward, backward-input, combined backward and
# backward-weight. Argument lists match the corresponding `miopen_convolution*`
# entries; each lists a single CUDA kernel named after the operator.
- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  dispatch:
    CUDA: miopen_depthwise_convolution

- func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_depthwise_convolution_backward_input

- func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_depthwise_convolution_backward

- func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CUDA: miopen_depthwise_convolution_backward_weight

# MIOpen RNN forward: returns five tensors. `cx` and `dropout_state` are
# optional (`Tensor?`). CUDA kernel only.
- func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: miopen_rnn

# MIOpen RNN backward: `output_mask` has four entries; returns three tensors
# plus a tensor list. CUDA kernel only.
- func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
  dispatch:
    CUDA: miopen_rnn_backward

# `mm`: dense backends have their own kernels (`mm_cpu`, `mm_cuda`); both sparse
# backends route to `_sparse_mm`, which is also declared as an operator below.
- func: mm(Tensor self, Tensor mat2) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: mm_cpu
    CUDA: mm_cuda
    SparseCPU: _sparse_mm
    SparseCUDA: _sparse_mm

# `out=` overload. Note the differing dense kernel name patterns:
# `mm_cpu_out` vs `mm_out_cuda`.
- func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: mm_cpu_out
    CUDA: mm_out_cuda
    SparseCPU: _sparse_mm_out
    SparseCUDA: _sparse_mm_out

# Standalone operator taking (`sparse`, `dense`); no `variants:` or `dispatch:` keys.
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  use_c10_dispatcher: full

# `mode`: returns a named pair `(values, indices)`. In the int overloads `dim`
# defaults to -1; the Dimname overloads have no default for `dim`.
- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: function, method

- func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

- func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  variants: function, method

- func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

# `mul` with a tensor operand: kernels for dense, sparse and MkldnnCPU backends.
- func: mul.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: mul
    CUDA: mul
    SparseCPU: mul_sparse
    SparseCUDA: mul_sparse
    MkldnnCPU: mkldnn_mul

# In-place form; exposed as a method only.
- func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: mul_
    CUDA: mul_
    SparseCPU: mul_sparse_
    SparseCUDA: mul_sparse_
    MkldnnCPU: mkldnn_mul_

# `out=` form. Unlike the two entries above, the sparse kernels are
# backend-specific here (`_cpu` / `_cuda` suffixes).
- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: mul_out
    CUDA: mul_out
    SparseCPU: mul_out_sparse_cpu
    SparseCUDA: mul_out_sparse_cuda
    MkldnnCPU: mkldnn_mul_out

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: mul.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# In-place Scalar form; method only, no `dispatch:` section.
- func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

# `mv`: takes `self` and `vec`. CPU and CUDA share the kernel name `mv`; both
# sparse backends use `mv_sparse`.
- func: mv(Tensor self, Tensor vec) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: mv
    CUDA: mv
    SparseCPU: mv_sparse
    SparseCUDA: mv_sparse

# `out=` overload; no `dispatch:` section.
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)

# `mvlgamma`: takes an integer `p`. The functional form is a function and a
# method; the in-place form is method-only. No `dispatch:` sections.
- func: mvlgamma(Tensor self, int p) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
  variants: method

# `narrow_copy`: returns a plain `Tensor` (no alias annotation), method-only,
# with separate dense and sparse kernels.
- func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    CPU: narrow_copy_dense
    CUDA: narrow_copy_dense
    SparseCPU: narrow_copy_sparse
    SparseCUDA: narrow_copy_sparse

# `narrow`: the return carries the same alias annotation `(a)` as `self`, i.e.
# the schema declares the result as aliasing the input. `device_guard` is off.
- func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# Overload where `start` is passed as a Tensor instead of an int.
- func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# Batch-norm forward: returns three tensors. `weight`, `bias` and the running
# statistics are all optional (`Tensor?`). Kernels for CPU, CUDA and MkldnnCPU.
- func: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
  dispatch:
    CPU: batch_norm_cpu
    CUDA: batch_norm_cuda
    MkldnnCPU: mkldnn_batch_norm

# Out overload with three keyword-only outputs (`out`, `save_mean`,
# `save_invstd`). Only a CUDA kernel is listed.
- func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  dispatch:
    CUDA: batch_norm_cuda_out

# The `batch_norm_stats` / `_elemt` / `_gather_stats*` entries below list CUDA
# kernels only.
- func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: batch_norm_stats_cuda

- func: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor
  dispatch:
    CUDA: batch_norm_elemt_cuda

- func: batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CUDA: batch_norm_elemt_cuda_out

# Kept for backward compatibility. Takes a single int `count`; the
# `_with_counts` entry that follows takes a `counts` tensor instead.
- func: batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor)
  dispatch:
    CUDA: batch_norm_gather_stats_cuda

- func: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor)
  dispatch:
    CUDA: batch_norm_gather_stats_with_counts_cuda

# Batch-norm backward: `output_mask` has three entries and three tensors are
# returned. CPU and CUDA kernels (no MkldnnCPU entry, unlike the forward).
- func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  dispatch:
    CPU: batch_norm_backward_cpu
    CUDA: batch_norm_backward_cuda

# CUDA-only backward helpers: `_reduce` returns four tensors, `_elemt` one.
- func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: batch_norm_backward_reduce_cuda

- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu) -> Tensor
  dispatch:
    CUDA: batch_norm_backward_elemt_cuda

# Returns two tensors; kernels for both CPU and CUDA.
- func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor)
  dispatch:
    CPU: batch_norm_update_stats_cpu
    CUDA: batch_norm_update_stats_cuda

# Zero-argument availability queries returning `bool`.
- func: is_vulkan_available() -> bool
  use_c10_dispatcher: full

- func: _nnpack_available() -> bool
  use_c10_dispatcher: full

# NNPACK spatial convolution forward; `bias` is optional, `stride` defaults to 1.
# All `_nnpack_spatial_convolution*` entries are function-only.
- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
  variants: function

# Combined backward; `output_mask` has three entries and three tensors are returned.
- func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function

- func: _nnpack_spatial_convolution_backward_input(Tensor input, Tensor grad_output, Tensor weight, int[2] padding) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Takes the weight's size (`weightsize`) rather than the weight tensor.
- func: _nnpack_spatial_convolution_backward_weight(Tensor input, int[] weightsize, Tensor grad_output, int[2] padding) -> Tensor
  use_c10_dispatcher: full
  variants: function

# `ones` factory overloads. The `.names` overload takes a required keyword-only
# `names` argument (typed `Dimname[]?`, no default) and turns `device_guard` off.
- func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: ones.out(int[] size, *, Tensor(a!) out) -> Tensor(a!)

# `ones_like`: in addition to the usual creation keywords, accepts an optional
# `memory_format`.
- func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

# Distance / similarity operators. None of the entries in this group has a
# `dispatch:` section.
# `pairwise_distance`: `p` defaults to 2, `eps` to 1e-06, `keepdim` to False.
- func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full

# `cdist`: `compute_mode` is an optional int defaulting to None.
- func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor
  use_c10_dispatcher: full

- func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor
  use_c10_dispatcher: full

# Underscore-prefixed cdist forward/backward; same arguments as `cdist` but
# without defaults.
- func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
  use_c10_dispatcher: full

- func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
  use_c10_dispatcher: full

- func: pdist(Tensor self, float p=2) -> Tensor
  use_c10_dispatcher: full

# Unlike `_cdist_forward`, this underscore-prefixed entry keeps the `p=2` default.
- func: _pdist_forward(Tensor self, float p=2) -> Tensor
  use_c10_dispatcher: full

- func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
  use_c10_dispatcher: full

# `cosine_similarity`: `dim` defaults to 1, `eps` to 1e-08; function-only.
- func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor
  use_c10_dispatcher: full
  variants: function

# `permute`: the return carries the same alias annotation `(a)` as `self`.
- func: permute(Tensor(a) self, int[] dims) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.

# Only exposed from C++ -- in Python,
# we expose it as an attribute `T`, not a function.
#
# I'd like to name this "T" in C++ too, but
# calling a native function "T" causes undefined
# behavior on Windows, for reasons I don't understand
# (maybe related to capital letter collation somehow...)
- func: numpy_T(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method

# `pixel_shuffle`: takes an integer `upscale_factor`; no `variants:` or
# `dispatch:` keys.
- func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
  use_c10_dispatcher: full

# `channel_shuffle`: takes an integer `groups`; kernels are listed for CPU and
# QuantizedCPU only (no CUDA entry).
- func: channel_shuffle(Tensor self, int groups) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: channel_shuffle
    QuantizedCPU: quantized_channel_shuffle

# Pinned-memory helpers, both method-only: `is_pinned` returns a bool,
# `pin_memory` returns a Tensor.
- func: is_pinned(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: method

- func: pin_memory(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method

# `pinverse`: `rcond` defaults to 1e-15; function and method.
- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# `poisson_nll_loss`: every argument is required (no defaults in the schema);
# `reduction` is passed as an int. Function-only.
- func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor
  use_c10_dispatcher: full
  variants: function

# `rad2deg` family: functional, in-place and `out=` overloads. Every entry
# sets `supports_named_tensor: True`; none lists backend kernels.
- func: rad2deg(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  supports_named_tensor: True

- func: rad2deg_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method
  supports_named_tensor: True

- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  supports_named_tensor: True

# `deg2rad` family: mirrors the `rad2deg` entries key for key.
- func: deg2rad(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  supports_named_tensor: True

- func: deg2rad_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method
  supports_named_tensor: True

- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  supports_named_tensor: True

# `scalar_tensor`: builds a Tensor from a single Scalar `s` plus the usual
# optional creation keywords.
- func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# `rand` factory overloads. The overloads that take `names` turn `device_guard`
# off. Where present, `generator` and `names` are keyword-only with no default,
# so callers must pass them explicitly.
- func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: rand.generator_with_names(int[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: rand.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: rand.out(int[] size, *, Tensor(a!) out) -> Tensor(a!)

- func: rand.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)

# `rand_like`: also accepts an optional `memory_format`.
- func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

# `randint` factory overloads: {high only | low + high} x {with | without
# generator}, each with a matching `out=` overload further down. None lists
# backend kernels.
- func: randint(int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randint.generator(int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randint.low(int low, int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randint.low_generator(int low, int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# `out=` counterparts of the four overloads above.
- func: randint.out(int high, int[] size, *, Tensor(a!) out) -> Tensor(a!)

- func: randint.generator_out(int high, int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)

- func: randint.low_out(int low, int high, int[] size, *, Tensor(a!) out) -> Tensor(a!)

- func: randint.low_generator_out(int low, int high, int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)

# `randint_like`: high-only and low + high overloads; both accept an optional
# `memory_format`.
- func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

- func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

# `randn` factory overloads; same overload set as `rand` (plain, generator,
# names, generator + names, and two `out=` forms). The overloads that take
# `names` turn `device_guard` off.
- func: randn(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randn.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randn.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: randn.generator_with_names(int[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: randn.out(int[] size, *, Tensor(a!) out) -> Tensor(a!)

- func: randn.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)

# `randn_like`: also accepts an optional `memory_format`.
- func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

# `randperm` overloads, all taking an int `n`. Only `randperm.generator_out`
# lists backend kernels; the other three have no `dispatch:` section.
- func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!)

- func: randperm.generator_out(int n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: randperm_out_cpu
    CUDA: randperm_out_cuda

# `range` factory. `range.step` takes a `step` (default 1); the unnamed
# overload takes only `start` and `end`.
- func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# `out=` overload (with `step`); the only `range` entry that lists backend kernels.
- func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: range_cpu_out
    CUDA: range_cuda_out

# `reciprocal` family: functional, in-place and `out=` overloads; none lists
# backend kernels.
- func: reciprocal(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: reciprocal_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# `neg` family: same three overloads; here the `out=` overload does list
# CPU/CUDA kernels (both `neg_out`).
- func: neg(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: neg_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: neg_out
    CUDA: neg_out

# `repeat`: returns a plain `Tensor` (no alias annotation).
- func: repeat(Tensor self, int[] repeats) -> Tensor
  use_c10_dispatcher: full
  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.

# `repeat_interleave.Tensor`: takes only a `repeats` tensor (no `self`);
# function-only, with CPU/CUDA kernels.
- func: repeat_interleave.Tensor(Tensor repeats) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: repeat_interleave_cpu
    CUDA: repeat_interleave_cuda

# `self` overloads: `repeats` as a Tensor or an int; `dim` is an optional int
# defaulting to None. No `dispatch:` section.
- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# `reshape`: the schema carries no alias annotation on `self` or on the return.
# `device_guard` is off for all three entries in this group.
- func: reshape(Tensor self, int[] shape) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# MkldnnCPU-only reshape; sets `requires_tensor: True`. Note the kernel name
# (`mkldnn_reshape`) drops the operator's leading underscore.
- func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
  use_c10_dispatcher: full
  device_guard: False
  requires_tensor: True
  dispatch:
    MkldnnCPU: mkldnn_reshape

# `reshape_as`: takes the target as a second tensor `other`; method-only.
- func: reshape_as(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method
  device_guard: False

# `round` family: functional, in-place and `out=` overloads; kernels are
# listed only on the `out=` overload.
- func: round(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: round_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

# CPU and CUDA both map to `round_out`.
- func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: round_out
    CUDA: round_out

# `rrelu` / `rrelu_`: `lower` defaults to 0.125, `upper` to 0.3333333333333333,
# `training` to False, and `generator` to None. Neither entry has `variants:`
# or `dispatch:` keys.
- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor

- func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)

# `relu`: kernels for CPU, CUDA, MkldnnCPU and QuantizedCPU.
- func: relu(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: relu
    CUDA: relu
    MkldnnCPU: mkldnn_relu
    QuantizedCPU: quantized_relu

# In-place form; same four backends.
- func: relu_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method
  dispatch:
    CPU: relu_
    CUDA: relu_
    MkldnnCPU: mkldnn_relu_
    QuantizedCPU: quantized_relu_

# `prelu`: takes a `weight` tensor; CPU and CUDA kernels.
- func: prelu(Tensor self, Tensor weight) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: prelu_cpu
    CUDA: prelu_cuda

# `prelu_backward`: returns two tensors.
# NOTE(review): presumably gradients w.r.t. `self` and `weight` — confirm.
- func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: prelu_backward_cpu
    CUDA: prelu_backward_cuda

# `gelu` / `gelu_backward`: both set `python_module: nn` and list CPU/CUDA
# kernels; no `variants:` key is given.
- func: gelu(Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: gelu_cpu
    CUDA: gelu_cuda

- func: gelu_backward(Tensor grad, Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: gelu_backward_cpu
    CUDA: gelu_backward_cuda

# `hardshrink`: `lambd` defaults to 0.5 in the forward and is required in the
# backward. Both are function and method; no `dispatch:` section.
- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# `rsqrt` family: functional, in-place and `out=` overloads; kernels are
# listed only on the `out=` overload.
- func: rsqrt(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: rsqrt_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

# CPU and CUDA both map to `rsqrt_out`.
- func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: rsqrt_out
    CUDA: rsqrt_out

# `select`: the return carries the same alias annotation `(a)` as `self`;
# `device_guard` is off. Dimname and int overloads for `dim`; only the int
# overload sets `use_c10_dispatcher: full`.
- func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
  variants: function, method
  device_guard: False

- func: select.int(Tensor(a) self, int dim, int index) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# `selu` / `selu_`: functional and in-place forms; no `variants:` or
# `dispatch:` keys.
- func: selu(Tensor self) -> Tensor
  use_c10_dispatcher: full

- func: selu_(Tensor(a!) self) -> Tensor(a!)

# `celu` / `celu_`: as above, with an `alpha` Scalar defaulting to 1.0.
- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor
  use_c10_dispatcher: full

- func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)

# `sigmoid`: kernels for CPU, CUDA, QuantizedCPU and MkldnnCPU.
- func: sigmoid(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: sigmoid
    CUDA: sigmoid
    QuantizedCPU: quantized_sigmoid
    MkldnnCPU: mkldnn_sigmoid

# In-place form; no QuantizedCPU kernel is listed here, unlike the functional form.
- func: sigmoid_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method
  dispatch:
    CPU: sigmoid_
    CUDA: sigmoid_
    MkldnnCPU: mkldnn_sigmoid_

# `out=` overload; no `dispatch:` section.
- func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: sin(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: sin_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: sin_out
    CUDA: sin_out

- func: sinh(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: sinh_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

# Returns a copy of this `Variable` that is detached from its autograd graph.
# This method is OK to call if the `Variable` is a view.
#
# NOTE: Previously, if we change the tensor metadata (e.g. sizes / strides /
# storage / storage_offset) of a tensor created from `detach()`, those metadata
# in the original tensor will also be updated. However, the new behavior is that
# those metadata changes to the detached tensor will not update the original tensor
# anymore, and in the `detach()` function we need to set `allow_tensor_metadata_change_`
# to false to make such changes explicitly illegal, in order to prevent users from
# changing metadata of the detached tensor and expecting the original tensor to also
# be updated.
- func: detach(Tensor self) -> Tensor
  use_c10_dispatcher: full
  manual_kernel_registration: True
  variants: function, method

# Like `detach()`, but modifies this `Variable` in-place. This method may
# only be called on non-view `Variable`s. You can use `is_view()` to check
# this. If this `Variable` is a view, throws an `std::runtime_error()`.
- func: detach_(Tensor(a!) self) -> Tensor(a!)
  manual_kernel_registration: True
  variants: function, method

- func: size.int(Tensor self, int dim) -> int
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: size.Dimname(Tensor self, Dimname dim) -> int
  variants: function, method
  device_guard: False

# `end` defaults to 9223372036854775807, i.e. 2**63 - 1, the largest signed
# 64-bit integer. NOTE(review): presumably this acts as an "until the end of
# `dim`" sentinel -- confirm against the kernel before changing it.
# The result shares the alias set `(a)` with `self`, and `device_guard: False`
# is set, as on the other alias-annotated ops nearby (select, squeeze, transpose).
- func: slice.Tensor(Tensor(a) self, int dim=0, int start=0, int end=9223372036854775807, int step=1) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
  use_c10_dispatcher: full
  variants: function, method

- func: smm(Tensor self, Tensor mat2) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models.
- func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: softmax_cpu
    CUDA: softmax_cuda
    MkldnnCPU: mkldnn_softmax

- func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: softmax_backward_cpu
    CUDA: softmax_backward_cuda

- func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# NOTE(review): the neighbouring `split.Tensor` schema marks both `self` and its
# result list with the alias set `(a)`, whereas this schema carries no alias
# annotation at all. Confirm whether the returned tensors alias `self`; if they
# do, the annotation has to be updated here and in every other file that
# repeats this signature string.
- func: split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: squeeze(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
  variants: function, method
  device_guard: False

- func: squeeze_(Tensor(a!) self) -> Tensor(a!)
  variants: method
  device_guard: False

- func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)
  variants: method
  device_guard: False

- func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)
  variants: method
  device_guard: False

- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _sspaddmm_out_only_sparse
    CUDA: _sspaddmm_out_only_sparse_cuda
    SparseCPU: _sspaddmm_out_cpu
    SparseCUDA: _sspaddmm_out_cuda

- func: stack(Tensor[] tensors, int dim=0) -> Tensor
  use_c10_dispatcher: full

- func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)

# The signature is designed to be consistent with librosa except that it is
# missing the `pad_mode` and `center` arguments, which are taken care of in
# `torch/functional.py`. They shall be moved here once we have a mapping between
# Python strings and C++ enums in codegen.
- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool onesided=True) -> Tensor
  variants: function, method

- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool onesided=True, int? length=None) -> Tensor
  variants: function, method

- func: stride.int(Tensor self, int dim) -> int
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: stride.Dimname(Tensor self, Dimname dim) -> int
  variants: function, method
  device_guard: False

- func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

- func: sum_to_size(Tensor self, int[] size) -> Tensor
  use_c10_dispatcher: full
  variants: method
  device_guard: False

- func: sqrt(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: sqrt_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: square(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: square_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: std(Tensor self, bool unbiased=True) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function

- func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function

- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
  variants: function

- func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
  variants: function, method

- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  variants: function, method

- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)

- func: t(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  device_guard: False
  variants: function, method

- func: t_(Tensor(a!) self) -> Tensor(a!)
  device_guard: False
  variants: method

- func: tan(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: tan_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: tanh(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: tanh
    CUDA: tanh
    QuantizedCPU: quantized_tanh

- func: tanh_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor
  use_c10_dispatcher: full
  variants: function

# TODO: namespace threshold in 'nn'
- func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: threshold
    CUDA: threshold_cuda
    QuantizedCPU: quantized_threshold

- func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: threshold_
    CUDA: threshold__cuda

- func: threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: threshold_out
    CUDA: threshold_out_cuda

- func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: threshold_backward
    CUDA: threshold_backward_cuda

- func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
  variants: function, method
  device_guard: False

# Transpose entry whose only registered kernel is `mkldnn_transpose` for the
# MkldnnCPU backend. Unlike `transpose.int`, the schema carries no alias
# annotation on `self` or on the result, and `requires_tensor: True` is set.
- func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor
  use_c10_dispatcher: full
  device_guard: False
  requires_tensor: True
  dispatch:
    MkldnnCPU: mkldnn_transpose

- func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
  variants: method
  device_guard: False

# Mutating counterpart of `_mkldnn_transpose`: `self` is annotated `Tensor(a!)`
# and the same alias is returned. The only registered kernel is
# `mkldnn_transpose_` for the MkldnnCPU backend; `requires_tensor: True` is set.
- func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
  device_guard: False
  requires_tensor: True
  dispatch:
    MkldnnCPU: mkldnn_transpose_

- func: one_hot(Tensor self, int num_classes=-1) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  variants: function

- func: flip(Tensor self, int[] dims) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: flip_cpu
    CUDA: flip_cuda

- func: fliplr(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: flipud(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: roll_cpu
    CUDA: roll_cuda

# The default int[] value [0,1] must not contain a space after the comma, since native_parse.py uses ', ' to split args

- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
  use_c10_dispatcher: full

- func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor
  use_c10_dispatcher: full

- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
  use_c10_dispatcher: full

# `reduction=Mean` is a symbolic default rather than an integer literal, even
# though the parameter is typed `int`. NOTE(review): presumably the code
# generator maps it to a reduction enum value -- confirm before changing it.
- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full

- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: true_divide
    CUDA: true_divide
    SparseCPU: true_divide_sparse
    SparseCUDA: true_divide_sparse

- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: true_divide_
    CUDA: true_divide_
    SparseCPU: true_divide_sparse_
    SparseCUDA: true_divide_sparse_

- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: true_divide_out
    CUDA: true_divide_out
    SparseCPU: true_divide_out_sparse_zerodim
    SparseCUDA: true_divide_out_sparse_zerodim

- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: trunc(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: trunc_(Tensor(a!) self) -> Tensor(a!)
  variants: function, method

- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: trunc_out
    CUDA: trunc_out

- func: type_as(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method

- func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool
  use_c10_dispatcher: full
  variants: function

- func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _unique_cpu
    CUDA: _unique_cuda

- func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: unique_dim_cpu
    CUDA: unique_dim_cuda

- func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: unique_consecutive_cpu
    CUDA: unique_consecutive_cuda

- func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: unique_dim_consecutive_cpu
    CUDA: unique_dim_consecutive_cuda

# _unique and _unique_dim are fragile, and modifying them can easily cause internal breakage.
# The operator below is a temporary hack for adding return_counts support.
# Please don't rely on these two operators; they will be removed soon.

- func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _unique2_cpu
    CUDA: _unique2_cuda

- func: _unsafe_view(Tensor self, int[] size) -> Tensor
  use_c10_dispatcher: full

- func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

- func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
  variants: method
  device_guard: False

- func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor
  use_c10_dispatcher: full

- func: var(Tensor self, bool unbiased=True) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
  variants: function, method

- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function

- func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function

- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
  variants: function

- func: view_as(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method
  device_guard: False

# we define both of these because 'where' does the broadcast and '_s_where' doesn't;
# this allows us to implicitly calculate the broadcast derivative, while only dealing with the
# _s_where derivative.
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: where(Tensor condition) -> Tensor[]
  use_c10_dispatcher: full
  variants: function

- func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
  use_c10_dispatcher: full
  variants: function

# VariableType::_weight_norm does not want to be given a gap in the autograd graph,
# so we don't define "dispatch" variants for it.
- func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Only a CUDA kernel (`weight_norm_cuda`) is registered for this op; the
# dispatch table has no CPU entry. The op returns a pair of tensors.
- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CUDA: weight_norm_cuda

# Only a CUDA kernel (`weight_norm_cuda_backward`) is registered for this op;
# the dispatch table has no CPU entry. Unlike the forward interface, `dim` has
# no default here. The op returns a pair of tensors.
- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CUDA: weight_norm_cuda_backward

- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function

- func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  device_guard: False

- func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: zeros.out(int[] size, *, Tensor(a!) out) -> Tensor(a!)

- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _standard_gamma_grad_cpu
    CUDA: _standard_gamma_grad_cuda

- func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor
  variants: function
  dispatch:
    CPU: _s_gamma_cpu
    CUDA: _s_gamma_cuda

- func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: _dirichlet_grad_cpu
    CUDA: _dirichlet_grad_cuda

- func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor
  variants: function
  dispatch:
    CPU: _s_dirichlet_cpu
    CUDA: _s_dirichlet_cuda

- func: poisson(Tensor self, Generator? generator=None) -> Tensor
  dispatch:
    CPU: _s_poisson_cpu
    CUDA: _s_poisson_cuda

- func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor
  dispatch:
    CPU: _s_binomial_cpu
    CUDA: _s_binomial_cuda

# When more variants get ported to native, this dispatch will get more
# complicated

# Only sparse backends are listed in the dispatch table: SparseCPU and
# SparseCUDA both route to the same `norm_sparse` kernel.
- func: native_norm(Tensor self, Scalar p=2) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: norm_sparse
    SparseCUDA: norm_sparse

# TODO: reduce signatures down to one when optional args are available
- func: _sparse_sum(Tensor self) -> Tensor
  use_c10_dispatcher: full

- func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor

- func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor
  use_c10_dispatcher: full

- func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor

# Kernels are registered for the sparse backends only (SparseCPU / SparseCUDA).
- func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: _sparse_sum_backward_cpu
    SparseCUDA: _sparse_sum_backward_cuda

- func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
  variants: function

- func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  variants: function

# Overload without an overload name; it takes `half_to_float` instead of the
# optional `dtype` accepted by `_sparse_softmax.int` / `_sparse_softmax.Dimname`.
# A kernel is registered for SparseCPU only; there is no SparseCUDA entry.
- func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: softmax_sparse_cpu

# A kernel is registered for SparseCPU only; there is no SparseCUDA entry.
# `use_c10_dispatcher: full` is set to match the dense `_softmax_backward_data`,
# which has the identical (Tensor, Tensor, int, Tensor) -> Tensor signature, and
# the forward `_sparse_softmax` op.
- func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: softmax_backward_sparse_cpu

- func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
  variants: function

- func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  variants: function

# Overload without an overload name; it takes `half_to_float` instead of the
# optional `dtype` accepted by `_sparse_log_softmax.int` / `.Dimname`.
# A kernel is registered for SparseCPU only; there is no SparseCUDA entry.
- func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: log_softmax_sparse_cpu

# A kernel is registered for SparseCPU only; there is no SparseCUDA entry.
# `use_c10_dispatcher: full` is set to match the dense `_softmax_backward_data`,
# which has the identical (Tensor, Tensor, int, Tensor) -> Tensor signature, and
# the forward `_sparse_log_softmax` op.
- func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: log_softmax_backward_sparse_cpu

- func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)

- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
  variants: function, method

- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
  variants: function, method

- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)

- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

- func: frobenius_norm(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  variants: function

- func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  variants: function

- func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  variants: function

# Kernels are registered per backend: dense CPU/CUDA share `clone`, sparse
# CPU/CUDA share `clone_sparse`, quantized CPU/CUDA share `quantized_clone`, and
# MkldnnCPU has its own `mkldnn_clone`. `memory_format` is keyword-only and
# optional (defaults to None).
- func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
  variants: function, method
  dispatch:
    CPU: clone
    CUDA: clone
    SparseCPU: clone_sparse
    SparseCUDA: clone_sparse
    MkldnnCPU: mkldnn_clone
    QuantizedCPU: quantized_clone
    QuantizedCUDA: quantized_clone

- func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!)
  manual_kernel_registration: True
  variants: function, method

- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: pow_out
    CUDA: pow_out
    SparseCPU: pow_out_sparse_scalar
    SparseCUDA: pow_out_sparse_scalar

- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: pow
    CUDA: pow
    SparseCPU: pow_sparse_scalar
    SparseCUDA: pow_sparse_scalar

- func: zero_(Tensor(a!) self) -> Tensor(a!)
  variants: method, function
  dispatch:
    CPU: zero_
    CUDA: zero_
    SparseCPU: zero_sparse_
    SparseCUDA: zero_sparse_
    MkldnnCPU: mkldnn_zero_

- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: sub_out
    CUDA: sub_out
    SparseCPU: sub_out_sparse
    SparseCUDA: sub_out_sparse

- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: sub
    CUDA: sub
    SparseCPU: sub_sparse
    SparseCUDA: sub_sparse

- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: sub_
    CUDA: sub_
    SparseCPU: sub_sparse_
    SparseCUDA: sub_sparse_

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
  variants: method

- func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function

# For C++ only, until we have conversion from C++ numbers to Tensor
- func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Functionally the same as addmm, but we give it a different derivative formula
# that doesn't propagate gradients to non-present entries on sparse.
- func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full

- func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: addmm_cpu_out
    CUDA: addmm_out_cuda
    SparseCPU: addmm_out_sparse_dense_cpu
    SparseCUDA: addmm_out_sparse_dense_cuda

- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU: addmm_cpu
    CUDA: addmm_cuda
    SparseCPU: addmm_sparse_dense_cpu
    SparseCUDA: addmm_sparse_dense_cuda
    Vulkan: vulkan_addmm

# Mutating variant of addmm, exposed as a method only (`self` is annotated
# `Tensor(a!)` and the same alias is returned). The CPU entry points at
# `legacy::cpu::_th_addmm_`, whereas the out= and functional variants use
# `addmm_cpu_out` / `addmm_cpu`. No Vulkan kernel is listed here, unlike the
# functional `addmm`.
- func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: legacy::cpu::_th_addmm_
    CUDA: addmm__cuda
    # Warning!  For whatever reason, the inplace sparse addmm is NON
    # broadcasting
    SparseCPU: s_addmm_sparse_dense_cpu_
    SparseCUDA: s_addmm_sparse_dense_cuda_

# NOTE [ Sparse: autograd and API ]
#
#
# Sparse Tensor Constructors
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The API entry points to sparse tensor construction should be
# `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`. Depending on whether the
# indices and values tensors are given, they eventually dispatch to either
# `sparse_coo_tensor_with_dims` or `sparse_coo_tensor_with_dims_and_tensors`.
#
# The autograd support for ctor is implemented on `sparse_coo_tensor_with_dims_and_tensors`.
#
# The API methods `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`
# **must not** have specific type dispatches because otherwise codegen will
# consider them as abstract methods (see Note [Abstract ATen methods]), dispatch
# using **Tensor** type, and thus lose autograd tracking on the actual method
# they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`.
#
# The actual ctors `sparse_coo_tensor_with_dims` and `sparse_coo_tensor_with_dims_and_tensors`,
# on the other hand, need to create `SparseTensorImpl` and know nothing about
# how `VariableType`s work. So they need to be dispatched using Tensor types.
# We thus put `requires_tensor=True` to ensure that `VariableType` will unwrap
# the given variables and call with the Tensor type.
#
#
# Sparse Methods API Design
# ~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Goals: 1. Flexible API for users to write custom sparse ops
#        2. ctor and member accessor with autograd support
#
# To achieve 1, we need to provide a set of *dangerous* APIs (dangerous in the
# sense that misusing them will break sparse tensor invariants and may result in
# unexpected behavior, e.g., crash). These methods are all prefixed with
# underscore "_" to indicate that they should be used with care. We provide:
#
#   + `_indices()`: returns the *raw* indices within the sparse tensor (not just
#                   sharing storage). Any inplace operation will change the
#                   actual indices, including t_, set_, as_strided_, resize_,
#                   etc.
#   + `_values()`: returns the *raw* values within the sparse tensor. Similar
#                  semantics as `_indices()`
#   + `_nnz()`: returns the number of non-zero entries. This will always be
#               determined by the shapes of indices and values.
#   + `_coalesced_(bool)`: inplace sets whether the tensor is coalesced, and
#                          returns itself.
#
# These methods are very useful in writing new operations, e.g., a custom
# autograd Function.
#
# We also provide other public *safe* APIs:
#   + `indices()`: returns a **view** of the indices tensor if the sparse tensor
#                  is **coalesced**.
#   + `values()`: returns a **view** of the values tensor if the containing
#                 sparse tensor is **coalesced**.
#   + `sparse_dim()`: number of sparse dimensions
#   + `dense_dim()`: number of dense dimensions
#   + `is_coalesced()`: whether the sparse tensor is coalesced
#
# `_indices()` and `_values()` should return the raw indices and values dense
# tensors within a sparse tensor. They can be quite unsafe with inplace
# operations like `t_()`, and expose uncoalesced indices and values. The public
# recommended API is `indices()` and `values()`, both of which first check that
# the tensor is coalesced and return views on those tensors.
#
#
# Autograd Support
# ~~~~~~~~~~~~~~~~
#
# Autograd is supported on `values()` and sparse tensor ctor with indices and
# values tensors. E.g., `torch.sparse_coo_tensor(i, v).values().sum()` is
# differentiable w.r.t. `v`.
#
# NB: The `values()` and `_values()` operators are special in that they are
# layout-aware, i.e., the output depends not just on the data it represents, but
# also on the input layout details (in this case, the `indices` tensor). See
# NOTE [ as_strided Backward and layout-aware/agnostic autograd ] in Functions.cpp
# for discussion on layout-aware vs layout-agnostic autograd. Since PyTorch ops
# operate in the layout-agnostic mode, similar to `as_strided`, backward of
# these two operators need to consider them in a layout-agnostic way:
#   + `values()`:
#     Input is coalesced.
#     We just pretend having `input.indices()` as an additional argument
#     `input_indices`, then forward is similar to
#     `input.to(kStrided).index_select(input_indices)` regardless of the layout.
#     Note that `values()` normally is layout-aware even if we constrain
#     ourselves on sparse inputs since it may include all-zero value entries
#     as "present" entries.
#   + `_values()`:
#     Input may be uncoalesced.
#     It is not straightforward to construct a layout-agnostic version because
#     duplicate indices entries may exist and additional parameterization is
#     needed to distribute the value into different values entries. Furthermore,
#     this op is intended to provide ways to write custom sparse ops, rather
#     than being used in autograd graph, so it is marked as *non-differentiable*
#     in derivatives.yaml.
#
# Before reading the following, see NOTE [ Autograd Variable Views ] in
# variable.h for details on views that are tracked by autograd, and views that
# are not.
#
# Moreover, these methods return tensors that share storage with inputs, so we
# mark these methods as view ops to support autograd history tracking.
# The sparse tensor ctor output should technically be a view of both input indices
# and values tensors, but currently we only support setting as view of a single
# Variable, so it is only a view of the values tensor.
# TODO: clone indices in sparse tensor ctor.
#
# For other methods that return outputs that share storage with inputs, i.e.,
# `indices()` and `_indices()`, we mark their outputs as non-differentiable, so
# the view relation is not tracked by autograd, but the version counter is still
# shared. In other words, their outputs are non-differentiable views of the
# sparse tensor.

# FIXME: it would be nicer if TensorOptions were optional-based; default arguments
# are not added for the options because a default would never make sense.
# Sparse COO tensor constructors. None of the four entries declares a
# `dispatch:` table.
#   + `.size`: built from a size only; dtype/layout/device are required keyword
#     arguments (no defaults).
#   + `.indices` / `.indices_size`: built from `indices` and `values` tensors,
#     without / with an explicit size; the tensor options are optional keywords.
#   + `_sparse_coo_tensor_unsafe`: same signature as `.indices_size`.
#     NOTE(review): presumably skips argument validation, as the name suggests —
#     confirm against the implementation.
- func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor

- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

# Internal constructors that take the sparse_dim/dense_dim split explicitly.
# Both are registered only for the sparse backends, and SparseCPU and
# SparseCUDA share the same kernel. dtype/layout/device are required keywords.
- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor
  dispatch:
    SparseCPU: new_with_dims_sparse
    SparseCUDA: new_with_dims_sparse
  requires_tensor: True

# Same as above, but additionally receives the `indices` and `values` tensors.
- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor
  dispatch:
    SparseCPU: new_with_dims_and_tensor_sparse
    SparseCUDA: new_with_dims_and_tensor_sparse
  requires_tensor: True

# In-place resize of a sparse tensor. `Tensor(a!)` marks `self` as mutated and
# returned. Method-only; SparseCPU and SparseCUDA share one kernel.
- func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
  variants: method
  dispatch:
    SparseCPU: sparse_resize_
    SparseCUDA: sparse_resize_
  requires_tensor: True

# Same signature as `sparse_resize_`, dispatched to a separate kernel.
# NOTE(review): presumably also drops the existing indices/values, per the
# name — confirm against the implementation.
- func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
  variants: method
  dispatch:
    SparseCPU: sparse_resize_and_clear_
    SparseCUDA: sparse_resize_and_clear_
  requires_tensor: True

# Method-only op taking `self` and a `mask` tensor. Dispatched only on the
# sparse backends, with distinct CPU and CUDA kernels.
- func: sparse_mask(Tensor self, Tensor mask) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: sparse_mask_cpu
    SparseCUDA: sparse_mask_cuda
  requires_tensor: True

# Method-only conversion. Both sparse backends share `sparse_to_dense`;
# MkldnnCPU tensors use a separate kernel.
- func: to_dense(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: sparse_to_dense
    SparseCUDA: sparse_to_dense
    MkldnnCPU: mkldnn_to_dense
  requires_tensor: True

# Backward helper taking the incoming `grad` and the forward `input`.
# Declares no `dispatch:` table.
- func: to_dense_backward(Tensor grad, Tensor input) -> Tensor
  use_c10_dispatcher: full

# Number of sparse dimensions. Method-only, sparse backends only, and no
# device guard is generated (`device_guard: False`).
- func: sparse_dim(Tensor self) -> int
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: sparse_dim_sparse
    SparseCUDA: sparse_dim_sparse
  requires_tensor: True
  device_guard: False

# Legacy method: alias of `sparse_dim` (dispatches to the same kernel).
- func: _dimI(Tensor self) -> int
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: sparse_dim_sparse
    SparseCUDA: sparse_dim_sparse
  requires_tensor: True
  device_guard: False

# Number of dense dimensions. Same shape of declaration as `sparse_dim`.
- func: dense_dim(Tensor self) -> int
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: dense_dim_sparse
    SparseCUDA: dense_dim_sparse
  requires_tensor: True
  device_guard: False

# Legacy method: alias of `dense_dim` (dispatches to the same kernel).
- func: _dimV(Tensor self) -> int
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: dense_dim_sparse
    SparseCUDA: dense_dim_sparse
  requires_tensor: True
  device_guard: False

# Integer-returning accessor on sparse tensors. Method-only; both sparse
# backends share one kernel and no device guard is generated.
# NOTE(review): presumably the number of stored entries (which may include
# duplicates when uncoalesced) — confirm against the implementation.
- func: _nnz(Tensor self) -> int
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: _nnz_sparse
    SparseCUDA: _nnz_sparse
  requires_tensor: True
  device_guard: False

# Returns a new tensor (the return type carries no alias annotation).
# Method-only, with distinct CPU and CUDA sparse kernels.
- func: coalesce(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: coalesce_sparse_cpu
    SparseCUDA: coalesce_sparse_cuda
  requires_tensor: True

# Whether the sparse tensor is coalesced. Both backends share one kernel and
# no device guard is generated.
- func: is_coalesced(Tensor self) -> bool
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: is_coalesced_sparse
    SparseCUDA: is_coalesced_sparse
  requires_tensor: True
  device_guard: False

# Raw (unchecked) accessor for the indices tensor. `Tensor(a) -> Tensor(a)`
# declares that the result aliases `self`. Method-only, sparse backends only,
# no device guard.
- func: _indices(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: _indices_sparse
    SparseCUDA: _indices_sparse
  requires_tensor: True
  device_guard: False

# Raw (unchecked) accessor for the values tensor; declaration mirrors
# `_indices`.
- func: _values(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: _values_sparse
    SparseCUDA: _values_sparse
  requires_tensor: True
  device_guard: False

# This method performs no checks; it only sets the flag directly, so it can be
# a bit unsafe. Similar to _indices and _values, this is useful for implementing
# custom sparse operations in Python/C++ extensions.
# In-place (`Tensor(a!)`) method that takes the new `coalesced` flag value.
# Both sparse backends share one kernel; no device guard is generated.
- func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!)
  variants: method
  dispatch:
    SparseCPU: _coalesced_sparse_
    SparseCUDA: _coalesced_sparse_
  requires_tensor: True
  device_guard: False

# Public accessor for the indices tensor. The result is declared as aliasing
# `self` (`Tensor(a) -> Tensor(a)`). Method-only, sparse backends only,
# no device guard.
- func: indices(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: indices_sparse
    SparseCUDA: indices_sparse
  requires_tensor: True
  device_guard: False

# Public accessor for the values tensor; declaration mirrors `indices`.
- func: values(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method
  dispatch:
    SparseCPU: values_sparse
    SparseCUDA: values_sparse
  requires_tensor: True
  device_guard: False

# Out variant: writes into the keyword-only `out` tensor (`Tensor(a!)`) and
# returns it. Separate CPU and CUDA sparse kernels.
- func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    SparseCPU: hspmm_out_sparse_cpu
    SparseCUDA: hspmm_out_sparse_cuda
  requires_tensor: True

# Functional variant returning a new tensor. No `variants:` key is given.
- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    SparseCPU: hspmm_sparse_cpu
    SparseCUDA: hspmm_sparse_cuda
  requires_tensor: True

# In-place copy from `src` into `self` (`Tensor(a!)`), exposed as a free
# function only. Both sparse backends share the `copy_sparse_` kernel.
- func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
  variants: function
  dispatch:
    SparseCPU: copy_sparse_
    SparseCUDA: copy_sparse_
  requires_tensor: True

# Returns a list of tensors that alias `self` (`Tensor(a)[]`). Available as
# both a function and a method; `dim` defaults to 0.
- func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]
  use_c10_dispatcher: full
  variants: function, method

# Named-dimension overload; `dim` is required here.
- func: unbind.Dimname(Tensor(a) self, Dimname dim) -> Tensor(a)[]
  variants: function, method

# Dense -> sparse conversion with an explicit `sparse_dim`. Method-only;
# dispatched on the dense CPU/CUDA backends to `dense_to_sparse`.
- func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    CPU: dense_to_sparse
    CUDA: dense_to_sparse

# Overload without `sparse_dim`; it uses the same kernel name as the overload
# above.
- func: to_sparse(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    CPU: dense_to_sparse
    CUDA: dense_to_sparse

# Dense -> MKLDNN conversion. Method-only; registered for the CPU backend only.
- func: to_mkldnn(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method
  dispatch:
    CPU: dense_to_mkldnn

# Function-only op bound under the `nn` Python module (`python_module: nn`).
# Registered for MkldnnCPU tensors only.
- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1) -> Tensor
  use_c10_dispatcher: full
  variants: function
  python_module: nn
  dispatch:
    MkldnnCPU: mkldnn_reorder_conv2d_weight

# Backward helper taking the incoming `grad` and the forward `input`.
# Declares no `dispatch:` table.
- func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor
  use_c10_dispatcher: full

# Per-tensor quantization with a scalar scale and zero point. Function-only;
# CPU and CUDA share the same kernel name.
- func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor
  variants: function
  dispatch:
    CPU: quantize_per_tensor
    CUDA: quantize_per_tensor

# List overload: scales and zero points are passed as tensors. Only a CPU
# kernel is registered.
- func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[]
  variants: function
  dispatch:
    CPU: quantize_per_tensor_list_cpu

# Per-channel quantization along `axis`. Only a CPU kernel is registered.
- func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor
  variants: function
  dispatch:
    CPU: quantize_per_channel_cpu

# Single-tensor overload: function and method. Registered for both quantized
# backends, which share one kernel.
- func: dequantize.self(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: dequantize_quant
    QuantizedCUDA: dequantize_quant

# List overload: function-only, and only a QuantizedCPU kernel is registered.
- func: dequantize.tensors(Tensor[] tensors) -> Tensor[]
  use_c10_dispatcher: full
  variants: function
  dispatch:
    QuantizedCPU: dequantize_tensors_quant

# Quantization-parameter accessors. All are available as function and method.
# The per-tensor accessors (`q_scale`, `q_zero_point`) are registered for
# QuantizedCPU and QuantizedCUDA; the per-channel accessors only for
# QuantizedCPU.
- func: q_scale(Tensor self) -> float
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: q_scale_quant
    QuantizedCUDA: q_scale_quant

- func: q_zero_point(Tensor self) -> int
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: q_zero_point_quant
    QuantizedCUDA: q_zero_point_quant

# Per-channel accessors (QuantizedCPU only).
- func: q_per_channel_scales(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: q_per_channel_scales_quant

- func: q_per_channel_zero_points(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: q_per_channel_zero_points_quant

- func: q_per_channel_axis(Tensor self) -> int
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: q_per_channel_axis_quant

# Function and method, registered for quantized tensors only, with distinct
# CPU and CUDA kernels.
# NOTE(review): presumably returns the underlying integer representation of a
# quantized tensor — confirm against the implementation.
- func: int_repr(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    QuantizedCPU: int_repr_quant_cpu
    QuantizedCUDA: int_repr_quant_cuda

# Internal constructor taking a tensor plus a scalar scale and zero point.
# Dispatched on the dense CPU/CUDA backends (the input is not yet quantized).
- func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: make_per_tensor_quantized_tensor_cpu
    CUDA: make_per_tensor_quantized_tensor_cuda

# Per-channel counterpart: scale and zero point are tensors, plus an `axis`.
# Only a CPU kernel is registered.
- func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: make_per_channel_quantized_tensor_cpu

# Returns the `QScheme` of a quantized tensor. Method-only; both quantized
# backends share one kernel.
- func: qscheme(Tensor self) -> QScheme
  use_c10_dispatcher: full
  variants: method
  dispatch:
    QuantizedCPU: qscheme_quant
    QuantizedCUDA: qscheme_quant

# Fake-quantization ops. All are function-only and declare no `dispatch:`
# table. Each forward op has a `_backward` companion whose signature is the
# forward signature with a leading `grad` tensor.
- func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Per-channel variant: scale and zero point are tensors, applied along `axis`.
- func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor
  use_c10_dispatcher: full
  variants: function

- func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Returns a `(float, int)` pair.
# NOTE(review): presumably (scale, zero_point) — confirm against the
# implementation.
- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int)
  use_c10_dispatcher: full
  variants: function

# to(Device) must not exist because all constructors of Device also work for
# TensorOptions. Otherwise, an ambiguity error is thrown.
# See NOTE [ TensorOptions Constructors ].
# `to` overloads. All are method-only, generate no device guard
# (`device_guard: False`) and declare no `dispatch:` table. They share the
# trailing `non_blocking`, `copy` and `memory_format` parameters.
#
# Keyword-only tensor options; dtype/layout/device are required.
- func: to.dtype_layout(Tensor self, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
  variants: method
  device_guard: False

# Positional device AND dtype (both required in this overload).
- func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
  variants: method
  device_guard: False

# Positional dtype only.
- func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
  variants: method
  device_guard: False

# Target described by another tensor, `other`.
- func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
  variants: method
  device_guard: False

# List-in, list-out op. No `variants:` key and no `dispatch:` table.
- func: meshgrid(Tensor[] tensors) -> Tensor[]
  use_c10_dispatcher: full

# Takes a list of tensors and returns a single tensor. Function-only.
- func: cartesian_prod(Tensor[] tensors) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Function-only. `r` defaults to 2 and `with_replacement` to False.
- func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor
  use_c10_dispatcher: full
  variants: function

# Returns the tensor's value as a `Scalar`. Method-only; no `dispatch:` table.
- func: item(Tensor self) -> Scalar
  use_c10_dispatcher: full
  variants: method

# Type-promotion queries. All return a `ScalarType` (or `bool` for `can_cast`)
# and declare no `dispatch:` table. The `result_type` overloads cover every
# Tensor/Scalar pairing of the two operands.
- func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType
  variants: function

- func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType
  variants: function

- func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType
  variants: function

# No explicit `variants:` key on this overload, unlike its siblings.
# NOTE(review): the README describes `function` as the default variant, so
# this should be equivalent — confirm.
- func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType

- func: can_cast(ScalarType from, ScalarType to) -> bool
  variants: function

- func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
  variants: function

# Returns a `Scalar` from a dense tensor; function-only, with distinct CPU and
# CUDA kernels.
# NB: Does NOT check precondition that numel == 1
- func: _local_scalar_dense(Tensor self) -> Scalar
  use_c10_dispatcher: full
  dispatch:
    CPU: _local_scalar_dense_cpu
    CUDA: _local_scalar_dense_cuda
  variants: function

# Fused RNN kernels
# The `_thnn_fused_*` entries register a CUDA kernel only. The
# `_thnn_differentiable_*_backward` entries declare no `dispatch:` table.
- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_lstm_cell_cuda

# `grad_hy` and `grad_cy` are optional tensors here.
- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_lstm_cell_backward_cuda

- func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor)

- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)
  dispatch:
    CUDA: _thnn_fused_gru_cell_cuda

# Unlike the LSTM backward, `grad_hy` is a required tensor here.
- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CUDA: _thnn_fused_gru_cell_backward_cuda

- func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)

# RNN cells and layers
# Each layer op has two overloads:
#   + `.input`: takes `input` and a trailing `batch_first` flag;
#   + `.data`:  takes `data` plus `batch_sizes` and has no `batch_first`.
# `lstm` takes `hx` as a tensor list and returns three tensors; `gru`,
# `rnn_tanh` and `rnn_relu` take a single `hx` tensor and return two.
# None of these entries declares a `dispatch:` table.
- func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full

- func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full

- func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
  use_c10_dispatcher: full

# Single-step cells. The bias tensors `b_ih`/`b_hh` are optional and default to
# None. `lstm_cell` takes `hx` as a tensor list and returns two tensors; the
# other cells take one `hx` tensor and return one.
- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)

- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor

- func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor

- func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor

# Quantized RNN layer registration has been moved to C10 dispatch in `RNN.cpp`

# Quantized RNN layers
# - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor)

# - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor)

# Quantized GRU layers

# - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
#   use_c10_dispatcher: full

# - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
#   use_c10_dispatcher: full

# Quantized RNN cells
# All four cells share the same parameter list after `hx`: weights, (required)
# biases, packed weights, column offsets, and per-weight scale / zero point
# scalars. As with the float cells, `quantized_lstm_cell` takes `hx` as a
# tensor list and returns two tensors; the others return one.
- func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor)
  use_c10_dispatcher: full

- func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
  use_c10_dispatcher: full

- func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
  use_c10_dispatcher: full

- func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
  use_c10_dispatcher: full

# PackedSequence utilities
# None of these entries declares `variants:` or a `dispatch:` table.
#
# Returns two tensors.
# NOTE(review): presumably (packed data, batch_sizes) — confirm.
- func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)
  use_c10_dispatcher: full

# Backward of the op above; needs the original `input_size` and `batch_sizes`.
- func: _pack_padded_sequence_backward(Tensor grad, int[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor
  use_c10_dispatcher: full

# Returns two tensors.
# NOTE(review): presumably (padded output, lengths) — confirm.
- func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor)
  use_c10_dispatcher: full

# wrappers for legacy TH methods

# `set_` overloads: in-place (`Tensor(a!)`), method-only.
#
# From a `Storage`. CPU and CUDA share one kernel; no device guard.
- func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)
  variants: method
  device_guard: False
  dispatch:
    CPU: set_
    CUDA: set_

# From a `Storage` with explicit offset/size/stride (`stride` defaults to the
# empty list). This is the only overload with quantized kernels.
- func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
  variants: method
  device_guard: False
  dispatch:
    CPU: set_storage_cpu_
    CUDA: set_storage_cuda_
    QuantizedCPU: set_storage_quantized_
    QuantizedCUDA: set_storage_quantized_

# From another tensor. CPU and CUDA share one kernel; no device guard.
- func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
  variants: method
  device_guard: False
  dispatch:
    CPU: set_tensor_
    CUDA: set_tensor_

# No-argument overload. Unlike the overloads above, it does not set
# `device_guard: False`.
- func: set_(Tensor(a!) self) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: set_cpu_
    CUDA: set_cuda_

# In-place, method-only op taking a `ConstQuantizerPtr`. Registered for the
# quantized backends only, which share one kernel.
- func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!)
  variants: method
  dispatch:
    QuantizedCPU: set_quantizer_
    QuantizedCUDA: set_quantizer_

# Boolean query comparing `self` against `tensor`. Method-only, no device
# guard; CPU and CUDA share one kernel.
- func: is_set_to(Tensor self, Tensor tensor) -> bool
  use_c10_dispatcher: full
  variants: method
  device_guard: False
  dispatch:
    CPU: is_set_to
    CUDA: is_set_to

# Masked fill / scatter. The in-place (`Tensor(a!)`) entries are method-only
# and carry the CPU/CUDA dispatch tables. The functional entries are available
# as function and method and declare no `dispatch:` table.
#
# `.Scalar` and `.Tensor` differ only in the type of `value`; both in-place
# overloads map to the same kernel names.
- func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: masked_fill__cpu
    CUDA: masked_fill__cuda

- func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: masked_fill__cpu
    CUDA: masked_fill__cuda

- func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: masked_scatter__cpu
    CUDA: masked_scatter__cuda

- func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Returns a tensor declared as aliasing `self` (`Tensor(a) -> Tensor(a)`).
# Method-only, no device guard. The dense and quantized backends share the
# `view` kernel; MkldnnCPU has its own.
- func: view(Tensor(a) self, int[] size) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method
  device_guard: False
  dispatch:
    CPU: view
    CUDA: view
    MkldnnCPU: mkldnn_view
    QuantizedCPU: view
    QuantizedCUDA: view

# In-place, method-only. `accumulate` defaults to False. Both backends
# dispatch to legacy TH kernels (`legacy::cpu` / `legacy::cuda`).
- func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: legacy::cpu::_th_put_
    CUDA: legacy::cuda::_th_put_

# In-place variant: method-only, with distinct CPU and CUDA kernels.
- func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: index_add_cpu_
    CUDA: index_add_cuda_

# Functional variant (function and method); no `dispatch:` table.
- func: index_add(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Named-dimension overload of the functional variant.
- func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
  variants: function, method

# `index_fill` family. Overload names encode <dim type>_<value type>:
# `int` vs `Dimname` for `dim`, `Scalar` vs `Tensor` for `value`.
# Only the two in-place `int` overloads carry a `dispatch:` table.
#
# Scalar value: dispatches to the legacy TH kernels.
- func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: legacy::cpu::_th_index_fill_
    CUDA: legacy::cuda::_th_index_fill_

- func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Tensor value: dispatches to a native `index_fill_` kernel rather than a
# legacy TH one.
- func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: index_fill_
    CUDA: index_fill_

- func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Named-dimension overloads (in-place, then functional).
- func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!)
  variants: method

- func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!)
  variants: method

- func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
  variants: function, method

- func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor
  variants: function, method

# `scatter` family. `.src` takes a source tensor; `.value` takes a scalar.
# Only the in-place (`Tensor(a!)`) entries carry a dispatch table, and CPU and
# CUDA share a kernel name in each.
- func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: scatter_
    CUDA: scatter_

- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# The scalar overload uses a different kernel name (`scatter_fill_`).
- func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: scatter_fill_
    CUDA: scatter_fill_

- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Named-dimension overloads exist for the functional variants only.
- func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
  variants: function, method

- func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
  variants: function, method

# In-place variant: method-only; CPU and CUDA share one kernel name.
- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: scatter_add_
    CUDA: scatter_add_

# Functional variant (function and method); no `dispatch:` table.
- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

# Named-dimension overload of the functional variant.
- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
  variants: function, method

# In-place comparison methods (lt, gt, le, ge, eq, ne). Each has a `.Scalar`
# and a `.Tensor` overload for `other`, mutates and returns `self`
# (`Tensor(a!)`), is method-only, and declares no `dispatch:` table.
- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

# `bitwise_and` family. Every form has a Tensor and a Scalar overload for
# `other`.
#   + `*_out`: function-only, writes into keyword-only `out`; the only entries
#     with a dispatch table (CPU and CUDA share `bitwise_and_out`).
#   + functional: method and function; no dispatch table.
#   + in-place (`bitwise_and_`): method-only; no dispatch table.
#   + `__and__` / `__iand__`: operator-named entries with the same signatures
#     as the functional / in-place forms.
- func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bitwise_and_out
    CUDA: bitwise_and_out

- func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bitwise_and_out
    CUDA: bitwise_and_out

- func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: __and__.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: __and__.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

# `bitwise_or` family. Every form has a Tensor and a Scalar overload for
# `other`.
#   + `*_out`: function-only, writes into keyword-only `out`; the only entries
#     with a dispatch table (CPU and CUDA share `bitwise_or_out`).
#   + functional: method and function; no dispatch table.
#   + in-place (`bitwise_or_`): method-only; no dispatch table.
#   + `__or__` / `__ior__`: operator-named entries with the same signatures
#     as the functional / in-place forms.
- func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bitwise_or_out
    CUDA: bitwise_or_out

- func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bitwise_or_out
    CUDA: bitwise_or_out

- func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: __or__.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: __or__.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

# `bitwise_xor` family. Every form has a Tensor and a Scalar overload for
# `other`.
#   + `*_out`: function-only, writes into keyword-only `out`; the only entries
#     with a dispatch table (CPU and CUDA share `bitwise_xor_out`).
#   + functional: method and function; no dispatch table.
#   + in-place (`bitwise_xor_`): method-only; no dispatch table.
#   + `__xor__` / `__ixor__`: operator-named entries with the same signatures
#     as the functional / in-place forms.
- func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bitwise_xor_out
    CUDA: bitwise_xor_out

- func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: bitwise_xor_out
    CUDA: bitwise_xor_out

- func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

- func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method

- func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

# Shift operators. Unlike the bitwise and/or/xor operator entries, every shift
# overload carries its own dispatch table, with CPU and CUDA sharing a kernel
# named after the operator. Functional forms are method and function; in-place
# forms (`__i*shift__`, `Tensor(a!)`) are method-only.
- func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: __lshift__
    CUDA: __lshift__

- func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: __lshift__
    CUDA: __lshift__

- func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: __ilshift__
    CUDA: __ilshift__

- func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: __ilshift__
    CUDA: __ilshift__

- func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: __rshift__
    CUDA: __rshift__

- func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: __rshift__
    CUDA: __rshift__

- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: __irshift__
    CUDA: __irshift__

- func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: __irshift__
    CUDA: __irshift__

# Assorted in-place (`Tensor(a!)`), method-only ops.
#
# Distinct CPU and CUDA kernels.
- func: lgamma_(Tensor(a!) self) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: _lgamma__cpu
    CUDA: _lgamma__cuda

# No `dispatch:` table.
- func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method

# `diagonal` defaults to 0 for both `tril_` and `triu_`.
- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: tril_cpu_
    CUDA: tril_cuda_

- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: triu_cpu_
    CUDA: triu_cuda_

# No `dispatch:` table on either of the next two entries.
- func: digamma_(Tensor(a!) self) -> Tensor(a!)
  variants: method

- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
  variants: method

# In-place, method-only. Both backends dispatch to legacy TH kernels
# (`legacy::cpu` / `legacy::cuda`).
- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: legacy::cpu::_th_renorm_
    CUDA: legacy::cuda::_th_renorm_

# In-place pow (method-only), overloaded on the exponent type (Scalar or
# Tensor). All four backend bindings use the kernel name `pow_`.
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: pow_
    CUDA: pow_

- func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: pow_
    CUDA: pow_

# In-place lerp (method-only), overloaded on the type of `weight` (Scalar or
# Tensor). Each overload/backend pair has its own kernel.
- func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: lerp_cpu_scalar_
    CUDA: lerp_cuda_scalar_

- func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: lerp_cpu_tensor_
    CUDA: lerp_cuda_tensor_

# In-place fmod (method-only), overloaded on the type of `other`.
# The CPU kernel is named `fmod_` while CUDA uses `fmod_cuda_`.
- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: fmod_
    CUDA: fmod_cuda_

- func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: fmod_
    CUDA: fmod_cuda_

# In-place remainder (method-only), overloaded on the type of `other`.
# CPU and CUDA share the kernel name `remainder_`.
- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: remainder_
    CUDA: remainder_

- func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: remainder_
    CUDA: remainder_

# addbmm family: in-place (`addbmm_`), out (`addbmm.out`) and out-of-place
# (`addbmm`) entries. `beta` and `alpha` are keyword-only Scalars defaulting
# to 1 in all three.
# The in-place CPU binding is a `legacy::cpu::_th_` kernel; every other binding
# in this family uses an `addbmm*` kernel name.
- func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: legacy::cpu::_th_addbmm_
    CUDA: addbmm__cuda

# Out variant: the result tensor is the keyword-only `out` argument.
- func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: addbmm_cpu_out
    CUDA: addbmm_out_cuda

- func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: addbmm_cpu
    CUDA: addbmm_cuda

# In-place addcdiv (method-only); `value` is a keyword-only Scalar defaulting
# to 1. No `dispatch:` section is declared.
- func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
  variants: method

# In-place random_ (method-only) in three overloads: with `from` and an
# optional `to`, with `to` only, and with no bounds. Each accepts an optional
# keyword-only `generator`. No `dispatch:` section is declared.
- func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
  variants: method

- func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
  variants: method

- func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# In-place sampling methods (method-only, each with an optional keyword-only
# `generator`, none with a `dispatch:` section).
- func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
  variants: method

- func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# NOTE: defaults here are mean=1, std=2 (unlike `normal_`, which declares
# mean=0, std=1 elsewhere in this file).
- func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
  variants: method

- func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# `p` has no default and must be supplied.
- func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# wrappers for TH functions

# diag: out variant with explicit CPU/CUDA kernels, and an out-of-place entry
# (method + function) that declares no `dispatch:` section.
- func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: diag_cpu_out
    CUDA: diag_cuda_out

- func: diag(Tensor self, int diagonal=0) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# cross: out and out-of-place entries. `dim` is an optional int defaulting to
# None. Neither entry declares a `dispatch:` section.
- func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)

- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# triu / tril: for each, an out variant with explicit CPU/CUDA kernels and an
# out-of-place entry (method + function) without a `dispatch:` section.
- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: triu_cpu_out
    CUDA: triu_cuda_out

- func: triu(Tensor self, int diagonal=0) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: tril_cpu_out
    CUDA: tril_cuda_out

- func: tril(Tensor self, int diagonal=0) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# tril_indices / triu_indices: factory-style entries taking no Tensor input.
# `dtype` defaults to `long`; `layout`, `device` and `pin_memory` default to
# None. Separate CPU and CUDA kernels are named.
- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
    CPU: tril_indices_cpu
    CUDA: tril_indices_cuda

- func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
    CPU: triu_indices_cpu
    CUDA: triu_indices_cuda

# trace (method + function). CPU is bound to a `legacy::cpu::_th_` kernel,
# CUDA to `trace_cuda`.
- func: trace(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_trace
    CUDA: trace_cuda

# ne comparison: four entries = {Scalar, Tensor} right-hand side x {out,
# out-of-place}. CPU and CUDA share a kernel name per entry; QuantizedCPU has
# its own `*_quantized_cpu` kernel.
- func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: ne_out
    CUDA: ne_out
    QuantizedCPU: ne_out_quantized_cpu

- func: ne.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: ne
    CUDA: ne
    QuantizedCPU: ne_quantized_cpu

- func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: ne_out
    CUDA: ne_out
    QuantizedCPU: ne_out_quantized_cpu

- func: ne.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: ne
    CUDA: ne
    QuantizedCPU: ne_quantized_cpu

# eq comparison: four entries = {Scalar, Tensor} right-hand side x {out,
# out-of-place}. CPU and CUDA share a kernel name per entry; QuantizedCPU has
# its own `*_quantized_cpu` kernel.
- func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: eq_out
    CUDA: eq_out
    QuantizedCPU: eq_out_quantized_cpu

- func: eq.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: eq
    CUDA: eq
    QuantizedCPU: eq_quantized_cpu

- func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: eq_out
    CUDA: eq_out
    QuantizedCPU: eq_out_quantized_cpu

- func: eq.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: eq
    CUDA: eq
    QuantizedCPU: eq_quantized_cpu

# ge comparison: four entries = {Scalar, Tensor} right-hand side x {out,
# out-of-place}. CPU and CUDA share a kernel name per entry; QuantizedCPU has
# its own `*_quantized_cpu` kernel.
- func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: ge_out
    CUDA: ge_out
    QuantizedCPU: ge_out_quantized_cpu

- func: ge.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: ge
    CUDA: ge
    QuantizedCPU: ge_quantized_cpu

- func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: ge_out
    CUDA: ge_out
    QuantizedCPU: ge_out_quantized_cpu

- func: ge.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: ge
    CUDA: ge
    QuantizedCPU: ge_quantized_cpu

# le comparison: four entries = {Scalar, Tensor} right-hand side x {out,
# out-of-place}. CPU and CUDA share a kernel name per entry; QuantizedCPU has
# its own `*_quantized_cpu` kernel.
- func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: le_out
    CUDA: le_out
    QuantizedCPU: le_out_quantized_cpu

- func: le.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: le
    CUDA: le
    QuantizedCPU: le_quantized_cpu

- func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: le_out
    CUDA: le_out
    QuantizedCPU: le_out_quantized_cpu

- func: le.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: le
    CUDA: le
    QuantizedCPU: le_quantized_cpu

# gt comparison: four entries = {Scalar, Tensor} right-hand side x {out,
# out-of-place}. CPU and CUDA share a kernel name per entry; QuantizedCPU has
# its own `*_quantized_cpu` kernel.
- func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: gt_out
    CUDA: gt_out
    QuantizedCPU: gt_out_quantized_cpu

- func: gt.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: gt
    CUDA: gt
    QuantizedCPU: gt_quantized_cpu

- func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: gt_out
    CUDA: gt_out
    QuantizedCPU: gt_out_quantized_cpu

- func: gt.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: gt
    CUDA: gt
    QuantizedCPU: gt_quantized_cpu

# lt comparison: four entries = {Scalar, Tensor} right-hand side x {out,
# out-of-place}. CPU and CUDA share a kernel name per entry; QuantizedCPU has
# its own `*_quantized_cpu` kernel.
- func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: lt_out
    CUDA: lt_out
    QuantizedCPU: lt_out_quantized_cpu

- func: lt.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: lt
    CUDA: lt
    QuantizedCPU: lt_quantized_cpu

- func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: lt_out
    CUDA: lt_out
    QuantizedCPU: lt_out_quantized_cpu

- func: lt.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: lt
    CUDA: lt
    QuantizedCPU: lt_quantized_cpu

# take: out and out-of-place entries, all four bindings being `legacy::`
# `_th_take*` kernels.
- func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_take_out
    CUDA: legacy::cuda::_th_take_out

- func: take(Tensor self, Tensor index) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_take
    CUDA: legacy::cuda::_th_take

# index_select with an integer `dim`: out and out-of-place entries.
# CPU kernels are `index_select_*cpu_`; CUDA bindings are `legacy::cuda::_th_`
# kernels. The out-of-place entry additionally binds SparseCPU and SparseCUDA
# to `index_select_sparse`.
- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: index_select_out_cpu_
    CUDA: legacy::cuda::_th_index_select_out

- func: index_select(Tensor self, int dim, Tensor index) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: index_select_cpu_
    CUDA: legacy::cuda::_th_index_select
    SparseCPU: index_select_sparse
    SparseCUDA: index_select_sparse

# Overloads taking a `Dimname dim` instead of an int. Neither declares a
# `dispatch:` section or `use_c10_dispatcher`.
- func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)

- func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor
  variants: method, function

# masked_select: out and out-of-place entries with dedicated CPU and CUDA
# kernels.
- func: masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: masked_select_out_cpu
    CUDA: masked_select_out_cuda

- func: masked_select(Tensor self, Tensor mask) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: masked_select_cpu
    CUDA: masked_select_cuda

# nonzero: out and out-of-place entries bound to `legacy::` `_th_nonzero*`
# kernels on both backends.
- func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_nonzero_out
    CUDA: legacy::cuda::_th_nonzero_out

- func: nonzero(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_nonzero
    CUDA: legacy::cuda::_th_nonzero

# Variant returning a list of tensors (`Tensor[]`) rather than a single
# Tensor. No `dispatch:` section is declared.
- func: nonzero_numpy(Tensor self) -> Tensor[]
  use_c10_dispatcher: full
  variants: method, function

# gather with an integer `dim`: out and out-of-place entries. `sparse_grad` is
# a keyword-only bool defaulting to False. CPU and CUDA share one kernel name
# per entry.
- func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: gather_out_cpu_cuda
    CUDA: gather_out_cpu_cuda

- func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: gather
    CUDA: gather

# Overloads taking a `Dimname dim`; no `dispatch:` section is declared.
- func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)

- func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor
  variants: method, function

# Underscore-prefixed backward helper; declares neither `variants:` nor
# `dispatch:`. Presumably used when `sparse_grad=True` — confirm against the
# autograd definitions.
- func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor
  use_c10_dispatcher: full

# addcmul family: out, out-of-place and in-place entries. `value` is a
# keyword-only Scalar defaulting to 1. None declares a `dispatch:` section.
- func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)

- func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
  variants: method

# addcdiv: out and out-of-place entries (the in-place `addcdiv_` is declared
# earlier in this file). `value` is a keyword-only Scalar defaulting to 1.
# Neither entry declares a `dispatch:` section.
- func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)

- func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# lstsq: returns a named pair (solution, QR). The `.X` overload takes the two
# result tensors as keyword-only `X` and `qr`. All bindings are `legacy::`
# `_th_gels*` kernels.
- func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)
  dispatch:
    CPU: legacy::cpu::_th_gels_out
    CUDA: legacy::cuda::_th_gels_out

- func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR)
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_gels
    CUDA: legacy::cuda::_th_gels

# triangular_solve: returns a named pair (solution, cloned_coefficient).
# Flag defaults: upper=True, transpose=False, unitriangular=False.
# The public entries declare no `dispatch:` section; only the helper below
# names per-backend kernels.
- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)

- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper: same arguments without defaults, unnamed returns.
- func: _triangular_solve_helper(Tensor self, Tensor A, bool upper, bool transpose, bool unitriangular) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _triangular_solve_helper_cpu
    CUDA: _triangular_solve_helper_cuda

# symeig: returns a named pair (eigenvalues, eigenvectors). Defaults:
# eigenvectors=False, upper=True. The public entries declare no `dispatch:`
# section; only the helper below names per-backend kernels.
- func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)

- func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors)
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper: same arguments without defaults, unnamed returns.
- func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _symeig_helper_cpu
    CUDA: _symeig_helper_cuda

# eig: returns a named pair (eigenvalues, eigenvectors); `eigenvectors`
# defaults to False. All bindings are `legacy::` `_th_eig*` kernels.
- func: eig.e(Tensor self, bool eigenvectors=False, *, Tensor(a!) e, Tensor(b!) v) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
  dispatch:
    CPU: legacy::cpu::_th_eig_out
    CUDA: legacy::cuda::_th_eig_out

- func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors)
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_eig
    CUDA: legacy::cuda::_th_eig

# svd: returns a named triple (U, S, V). Defaults: some=True, compute_uv=True.
# The public entries declare no `dispatch:` section; only the helper below
# names per-backend kernels.
- func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)

- func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper: same arguments without defaults, unnamed returns.
- func: _svd_helper(Tensor self, bool some, bool compute_uv) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _svd_helper_cpu
    CUDA: _svd_helper_cuda

# cholesky: `upper` defaults to False. The public entries declare no
# `dispatch:` section; only the helper below names per-backend kernels.
- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)

- func: cholesky(Tensor self, bool upper=False) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper; `upper` has no default here.
- func: _cholesky_helper(Tensor self, bool upper) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _cholesky_helper_cpu
    CUDA: _cholesky_helper_cuda

# cholesky_solve: `upper` defaults to False. The public entries declare no
# `dispatch:` section; only the helper below names per-backend kernels.
- func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)

- func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper. Its second argument is named `A` where the public
# entries use `input2`; `upper` has no default.
- func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _cholesky_solve_helper_cpu
    CUDA: _cholesky_solve_helper_cuda

# solve: returns a named pair (solution, LU). The public entries declare no
# `dispatch:` section; only the helper below names per-backend kernels.
- func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU)
  use_c10_dispatcher: full
  variants: function, method

# Out overload: result tensors are the keyword-only `solution` and `lu`.
- func: solve.solution(Tensor self, Tensor A, *, Tensor(a!) solution, Tensor(b!) lu) -> (Tensor(a!) solution, Tensor(b!) LU)

# Function-only helper with unnamed returns.
- func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _solve_helper_cpu
    CUDA: _solve_helper_cuda

# cholesky_inverse: `upper` defaults to False. All bindings are `legacy::`
# kernels named `_th_potri*`.
- func: cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_potri_out
    CUDA: legacy::cuda::_th_potri_out

- func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_potri
    CUDA: legacy::cuda::_th_potri

# qr: returns a named pair (Q, R); `some` defaults to True. The public entries
# declare no `dispatch:` section; only the helper below names per-backend
# kernels.
- func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)

- func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper; `some` has no default, returns are unnamed.
- func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _qr_helper_cpu
    CUDA: _qr_helper_cuda

# geqrf: returns a named pair (a, tau). All bindings are `legacy::`
# `_th_geqrf*` kernels.
- func: geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau)
  dispatch:
    CPU: legacy::cpu::_th_geqrf_out
    CUDA: legacy::cuda::_th_geqrf_out

- func: geqrf(Tensor self) -> (Tensor a, Tensor tau)
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_geqrf
    CUDA: legacy::cuda::_th_geqrf

# orgqr: out and out-of-place entries. NOTE: only a CPU binding (a
# `legacy::cpu::_th_` kernel) is listed; there is no CUDA entry.
- func: orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_orgqr_out

- func: orgqr(Tensor self, Tensor input2) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_orgqr

# ormqr: out and out-of-place entries. Defaults: left=True, transpose=False.
# NOTE: only a CPU binding (a `legacy::cpu::_th_` kernel) is listed; there is
# no CUDA entry.
- func: ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_ormqr_out

- func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_ormqr

# Function-only entry returning three unnamed tensors. Defaults: pivot=True,
# check_errors=True. Separate CPU and CUDA kernels are named.
- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor, Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _lu_with_info_cpu
    CUDA: _lu_with_info_cuda

# lu_solve: takes `LU_data` and `LU_pivots` alongside `self`. The public
# entries declare no `dispatch:` section; only the helper below names
# per-backend kernels.
- func: lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!)

- func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# Function-only helper with the same argument list.
- func: _lu_solve_helper(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _lu_solve_helper_cpu
    CUDA: _lu_solve_helper_cuda

# multinomial: out and out-of-place entries. `replacement` defaults to False
# and `generator` is an optional keyword-only argument. CPU and CUDA are bound
# to the same kernel name in each entry.
# TODO: remove dispatch section when porting TH CUDA to ATen
- func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: multinomial_out
    CUDA: multinomial_out

- func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
  variants: method, function
  dispatch:
    CPU: multinomial
    CUDA: multinomial

# Function-only multinomial alias helpers, bound to `legacy::` `_th_` kernels
# on both backends. `_setup` returns two unnamed tensors; `_draw` takes two
# tensors `J` and `q` — presumably the pair produced by `_setup`; confirm
# against the kernel implementation.
- func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: legacy::cpu::_th_multinomial_alias_setup
    CUDA: legacy::cuda::_th_multinomial_alias_setup

- func: _multinomial_alias_draw(Tensor J, Tensor q, int num_samples, *, Generator? generator=None) -> Tensor
  variants: function
  dispatch:
    CPU: legacy::cpu::_th_multinomial_alias_draw
    CUDA: legacy::cuda::_th_multinomial_alias_draw

# lgamma: out variant with separate CPU/CUDA kernels, and an out-of-place
# entry (method + function) where both backends use the kernel name `lgamma`.
- func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _lgamma_out_cpu
    CUDA: _lgamma_out_cuda

- func: lgamma(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: lgamma
    CUDA: lgamma

# digamma: out and out-of-place entries; neither declares a `dispatch:`
# section.
- func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: digamma(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# polygamma: out and out-of-place entries; neither declares a `dispatch:`
# section. NOTE: the integer `n` is the FIRST positional argument and `self`
# the second, even though the out-of-place entry is also exposed as a method.
- func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)

- func: polygamma(int n, Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# erfinv family: out-of-place, in-place and out entries. The out-of-place
# entry uses the kernel name `erfinv` on both backends; the in-place and out
# entries name separate `_cpu` / `_cuda` kernels.
- func: erfinv(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: erfinv
    CUDA: erfinv

- func: erfinv_(Tensor(a!) self) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: _erfinv__cpu
    CUDA: _erfinv__cuda

- func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _erfinv_out_cpu
    CUDA: _erfinv_out_cuda

# sign family: out-of-place, in-place and out entries. Only the out variant
# declares a `dispatch:` section (kernel `sign_out` on CPU and CUDA).
- func: sign(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method

- func: sign_(Tensor(a!) self) -> Tensor(a!)
  variants: method

- func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: sign_out
    CUDA: sign_out

# dist (method + function); `p` is a Scalar defaulting to 2. No `dispatch:`
# section is declared.
- func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# atan2: out and out-of-place entries (the in-place `atan2_` is declared
# earlier in this file). Neither declares a `dispatch:` section.
- func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)

- func: atan2(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# lerp: four entries = {Scalar, Tensor} `weight` x {out, out-of-place}. Every
# entry/backend pair names its own kernel (`lerp_<backend>_<scalar|tensor>`
# with an `_out` suffix for the out variants).
- func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: lerp_cpu_scalar_out
    CUDA: lerp_cuda_scalar_out

- func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: lerp_cpu_tensor_out
    CUDA: lerp_cuda_tensor_out

- func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: lerp_cpu_scalar
    CUDA: lerp_cuda_scalar

- func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: lerp_cpu_tensor
    CUDA: lerp_cuda_tensor

# histc: out and out-of-place entries. Defaults: bins=100, min=0, max=0.
# CPU bindings are `legacy::cpu::_th_` kernels; CUDA uses `_histc*_cuda`.
# NOTE(review): min and max share the default 0 — presumably the kernel treats
# min == max as "derive the range from the data"; confirm in the kernel.
- func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_histc_out
    CUDA: _histc_out_cuda

- func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_histc
    CUDA: _histc_cuda

# fmod: four entries = {Scalar, Tensor} `other` x {out, out-of-place}.
# CPU kernels are `fmod` / `fmod_out`; CUDA kernels are `fmod_cuda` /
# `fmod_cuda_out`.
- func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: fmod_out
    CUDA: fmod_cuda_out

- func: fmod.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: fmod
    CUDA: fmod_cuda

- func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: fmod_out
    CUDA: fmod_cuda_out

- func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: fmod
    CUDA: fmod_cuda

# remainder: four entries = {Scalar, Tensor} `other` x {out, out-of-place}.
# CPU and CUDA share the kernel names `remainder` / `remainder_out`.
- func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: remainder_out
    CUDA: remainder_out

- func: remainder.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: remainder
    CUDA: remainder

- func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: remainder_out
    CUDA: remainder_out

- func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: remainder
    CUDA: remainder

# min: the `.out` and `.other` overloads take a second Tensor `other` and
# declare no `dispatch:` section.
- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)

- func: min.other(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# Single-argument overload; additionally bound for QuantizedCPU.
- func: min(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: min
    CUDA: min
    QuantizedCPU: min_quant

# max: the `.out` and `.other` overloads take a second Tensor `other` and
# declare no `dispatch:` section.
- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)

- func: max.other(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# Single-argument overload; additionally bound for QuantizedCPU.
- func: max(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: max
    CUDA: max
    QuantizedCPU: max_quant

# Single-argument median (method + function) with separate CPU and CUDA
# kernels.
- func: median(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: median_cpu
    CUDA: median_cuda

# sort with an integer `dim` (default -1); `descending` defaults to False.
# Returns a named pair (values, indices). CPU and CUDA are bound to `legacy::`
# `_th_sort*` kernels; the out-of-place entry is additionally bound for
# QuantizedCPU.
- func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
  dispatch:
    CPU: legacy::cpu::_th_sort_out
    CUDA: legacy::cuda::_th_sort_out

- func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_sort
    CUDA: legacy::cuda::_th_sort
    QuantizedCPU: sort_quant

# Overloads taking a `Dimname dim` (no default); no `dispatch:` section.
- func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)

- func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
  variants: method, function

# argsort: integer-`dim` (default -1) and `Dimname` overloads, each returning
# a single Tensor. Neither declares a `dispatch:` section.
- func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

- func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
  variants: method, function

# Out overload of topk: the keyword-only `values` and `indices` arguments
# (annotated `(a!)` / `(b!)`) are also the named returns. Defaults: dim=-1,
# largest=True, sorted=True. CPU uses `topk_out_cpu`; CUDA is bound to a
# `legacy::cuda::_th_` kernel.
# The return arrow is written `-> (` with a space, matching every other entry
# in this file.
- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
  dispatch:
    CPU: topk_out_cpu
    CUDA: legacy::cuda::_th_topk_out

# Out-of-place topk returning a named pair (values, indices). Defaults:
# dim=-1, largest=True, sorted=True. CPU and CUDA share the kernel name
# `topk`; QuantizedCPU has its own kernel.
- func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: topk
    CUDA: topk
    QuantizedCPU: quantized_topk_cpu

# Single-argument `all` (method + function); no `dispatch:` section.
- func: all(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function

# Single-argument `any`; unlike `all` it lists explicit bindings, including
# SparseCPU/SparseCUDA (`any_sparse`).
- func: any(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: any
    CUDA: any
    SparseCPU: any_sparse
    SparseCUDA: any_sparse

# renorm: out and out-of-place entries (the in-place `renorm_` is declared
# earlier in this file). All bindings are `legacy::` `_th_renorm*` kernels.
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_renorm_out
    CUDA: legacy::cuda::_th_renorm_out

- func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_renorm
    CUDA: legacy::cuda::_th_renorm

# unfold (method-only). `self` and the return share the `(a)` annotation
# (without `!`). `device_guard` is explicitly disabled, and the same kernel
# name is bound for CPU, CUDA, QuantizedCPU and QuantizedCUDA.
- func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method
  device_guard: False
  dispatch:
    CPU: unfold
    CUDA: unfold
    QuantizedCPU: unfold
    QuantizedCUDA: unfold

# Function-only backward entry taking the incoming gradient and the original
# input sizes. It does not declare `use_c10_dispatcher`.
- func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor
  variants: function
  dispatch:
    CPU: unfold_backward
    CUDA: unfold_backward

# equal: returns a plain `bool` rather than a Tensor. CPU and CUDA are bound
# to `legacy::` `_th_equal` kernels; QuantizedCPU has its own kernel.
- func: equal(Tensor self, Tensor other) -> bool
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: legacy::cpu::_th_equal
    CUDA: legacy::cuda::_th_equal
    QuantizedCPU: quantized_equal_cpu

# pow: Tensor-base/Tensor-exponent overloads (out and out-of-place). All
# overloads in this group use the kernel names `pow` / `pow_out` on both
# backends.
- func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: pow_out
    CUDA: pow_out

- func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
  use_c10_dispatcher: full
  variants: method, function
  dispatch:
    CPU: pow
    CUDA: pow

# Scalar-base overloads: here `self` is a Scalar and `exponent` a Tensor.
- func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: pow_out
    CUDA: pow_out

# No `variants:` key is given for this overload.
- func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: pow
    CUDA: pow

# normal family. None of these entries declares a `dispatch:` section, and
# every one accepts an optional keyword-only `generator`.
# In-place form (method-only); defaults mean=0, std=1.
- func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
  variants: method

# Overloads are named after the (mean, std) argument types. Each has an
# out variant (`*_out`) taking a keyword-only `out` tensor.
# Tensor mean, float std (default 1).
- func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)

- func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor

# float mean, Tensor std.
- func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)

- func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor

# Tensor mean, Tensor std.
- func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)

- func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor

# float mean, float std: takes an explicit `size`, plus optional
# dtype/layout/device/pin_memory keywords on the non-out overload.
- func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

- func: normal.float_float_out(float mean, float std, int[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)

# alias (method + function). `self` and the return share the `(a)` annotation
# (without `!`). No `dispatch:` section is declared.
- func: alias(Tensor(a) self) -> Tensor(a)
  use_c10_dispatcher: full
  variants: method, function

# Underscore-prefixed addr entries: out-of-place, in-place and out. `beta` and
# `alpha` are keyword-only Scalars defaulting to 1. CPU bindings are
# `legacy::cpu::_th_addr*` kernels; CUDA uses `addr*_cuda` kernels.
# No `variants:` key is given on any of the three.
- func: _addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: legacy::cpu::_th_addr
    CUDA: addr_cuda

- func: _addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_addr_
    CUDA: addr__cuda

- func: _addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_addr_out
    CUDA: addr_out_cuda

# Underscore-prefixed in-place index copy; `self` is `Tensor(a!)` and is
# returned. Both backends are bound to `legacy::` `_th_index_copy_` kernels.
# No `variants:` key is given.
- func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
  dispatch:
    CPU: legacy::cpu::_th_index_copy_
    CUDA: legacy::cuda::_th_index_copy_

# Underscore-prefixed cumulative sum / product entries along `dim`, each with
# an out-of-place and an out form and separate CPU/CUDA kernels. No
# `variants:` key is given.
- func: _cumsum(Tensor self, int dim) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: _cumsum_cpu
    CUDA: _cumsum_cuda

- func: _cumsum.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _cumsum_out_cpu
    CUDA: _cumsum_out_cuda

- func: _cumprod(Tensor self, int dim) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: _cumprod_cpu
    CUDA: _cumprod_cuda

- func: _cumprod.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _cumprod_out_cpu
    CUDA: _cumprod_out_cuda

# AMP helper entries (function-only). NOTE: only a CUDA binding is listed for
# each; there is no CPU entry.
# Returns nothing (`()`); both `self` and `found_inf` carry `!` annotations.
- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
  variants: function
  dispatch:
    CUDA: _amp_non_finite_check_and_unscale_cuda_

# `growth_tracker` is annotated `(a!)`, while the return is a plain `Tensor`
# (not annotated as aliasing any input).
- func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor
  variants: function
  dispatch:
    CUDA: _amp_update_scale_cuda

# Underscore-prefixed cat entries taking a list of tensors; `dim` defaults to
# 0. Out-of-place and out forms, each bound for CPU, CUDA and QuantizedCPU.
# No `variants:` key is given.
- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: _cat_cpu
    CUDA: cat_cuda
    QuantizedCPU: quantized_cat

- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _cat_out_cpu
    CUDA: cat_out_cuda
    QuantizedCPU: quantized_cat_out

# Underscore-prefixed mode entries returning two tensors. Defaults: dim=-1,
# keepdim=False. All bindings are `legacy::` `_th_mode*` kernels.
# The returns are unnamed in both entries, although the out overload's
# arguments are called `values` and `indices`.
- func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  dispatch:
    CPU: legacy::cpu::_th_mode
    CUDA: legacy::cuda::_th_mode

- func: _mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  dispatch:
    CPU: legacy::cpu::_th_mode_out
    CUDA: legacy::cuda::_th_mode_out

# bucketize: Tensor-input (out-of-place and out) and Scalar-input overloads.
# `out_int32` and `right` are keyword-only bools defaulting to False. The
# Tensor and Scalar out-of-place overloads share the same kernel names.
# No `variants:` key is given.
- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: bucketize_cpu
    CUDA: bucketize_cuda

- func: bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: bucketize_out_cpu
    CUDA: bucketize_out_cuda

# Here `self` is a Scalar rather than a Tensor.
- func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: bucketize_cpu
    CUDA: bucketize_cuda

# searchsorted: Tensor-input (out-of-place and out) and Scalar-input
# overloads. NOTE: `sorted_sequence` is the first positional argument and
# `self` the second. `out_int32` and `right` are keyword-only bools defaulting
# to False. No `variants:` key is given.
- func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: searchsorted_cpu
    CUDA: searchsorted_cuda

- func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: searchsorted_out_cpu
    CUDA: searchsorted_out_cuda

# Here `self` is a Scalar; the kernel names match the Tensor overload.
- func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: searchsorted_cpu
    CUDA: searchsorted_cuda

## NN wrappers

# mse_loss family: out= and functional forward entries followed by the two
# backward entries. All four declare `python_module: nn`.
# The forward entries declare no `dispatch:` table in this file; `reduction`
# defaults to the symbolic value `Mean`.
- func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward, writing into the keyword-only `grad_input`. `reduction` has no
# default here. CPU and CUDA name the same kernel.
- func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: mse_loss_backward_out
    CUDA: mse_loss_backward_out

# Backward, returning a new Tensor. CPU and CUDA name the same kernel.
- func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: mse_loss_backward
    CUDA: mse_loss_backward

# l1_loss family: same four-entry layout as mse_loss (out= forward, functional
# forward, `grad_input` backward, functional backward), all with
# `python_module: nn`. Only the `grad_input` backward entry declares a
# `dispatch:` table; the other three declare none in this file.
- func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward writing into `grad_input`; CPU and CUDA name the same kernel.
- func: l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: l1_loss_backward_out
    CUDA: l1_loss_backward_out

# Functional backward; unlike mse_loss_backward, no dispatch table is declared.
- func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# multi_margin_loss family. Every entry takes an optional `weight`
# (`Tensor?`, default None) and none of them sets `use_c10_dispatcher: full`
# (NOTE(review): presumably because of the optional Tensor argument - confirm).
# CPU names native `multi_margin_loss_cpu*` kernels; CUDA names `_thnn_*`
# bindings in the `legacy::cuda` namespace.
- func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: multi_margin_loss_cpu_out
    CUDA: legacy::cuda::_thnn_multi_margin_loss_forward_out

- func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor
  python_module: nn
  dispatch:
    CPU: multi_margin_loss_cpu
    CUDA: legacy::cuda::_thnn_multi_margin_loss_forward

# Backward entries: `p` and `margin` are required here (no defaults), while
# `weight` and `reduction` keep their defaults.
- func: multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: multi_margin_loss_cpu_backward_out
    CUDA: legacy::cuda::_thnn_multi_margin_loss_backward_out

- func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor
  python_module: nn
  dispatch:
    CPU: multi_margin_loss_cpu_backward
    CUDA: legacy::cuda::_thnn_multi_margin_loss_backward

# multilabel_margin_loss family, split in three layers:
#   * `multilabel_margin_loss` / `.out`  - single-Tensor result, no dispatch
#     table declared here;
#   * `multilabel_margin_loss_forward`   - returns `(output, is_target)` and
#     carries the backend kernels;
#   * `multilabel_margin_loss_backward`  - takes `is_target` as an input.
# CPU names native kernels; CUDA names `legacy::cuda::_thnn_*` bindings.
- func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Forward with two keyword-only outputs; `reduction` has no default here.
- func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: multilabel_margin_loss_forward_out_cpu
    CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward_out

# Functional forward; the returned tensors are named `output` and `is_target`.
- func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: multilabel_margin_loss_forward_cpu
    CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward

# Backward entries: `is_target` is a required positional input after
# `reduction`.
- func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: multilabel_margin_loss_backward_cpu_out
    CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out

- func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: multilabel_margin_loss_backward_cpu
    CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward

# nll_loss family, layered like multilabel_margin_loss:
#   * `nll_loss` / `.out`   - single-Tensor result, defaults for `weight`
#     (None), `reduction` (Mean) and `ignore_index` (-100), no dispatch table;
#   * `nll_loss_forward`    - returns `(output, total_weight)`, no defaults;
#   * `nll_loss_backward`   - takes `total_weight` as an input.
# Every entry has a `Tensor? weight` argument and none of them sets
# `use_c10_dispatcher: full`. CPU names native kernels; CUDA names
# `legacy::cuda::_thnn_*` bindings.
- func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor
  python_module: nn

# Forward with two keyword-only outputs (`output`, `total_weight`).
- func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: nll_loss_forward_out_cpu
    CUDA: legacy::cuda::_thnn_nll_loss_forward_out

- func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)
  python_module: nn
  dispatch:
    CPU: nll_loss_forward_cpu
    CUDA: legacy::cuda::_thnn_nll_loss_forward

# Backward entries: `total_weight` is the last positional input.
- func: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: nll_loss_backward_out_cpu
    CUDA: legacy::cuda::_thnn_nll_loss_backward_out

- func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor
  python_module: nn
  dispatch:
    CPU: nll_loss_backward_cpu
    CUDA: legacy::cuda::_thnn_nll_loss_backward

# nll_loss2d family: the signatures mirror the nll_loss entries one-for-one
# (same argument names, defaults and `(output, total_weight)` forward result);
# only the operator and kernel names differ. No entry sets
# `use_c10_dispatcher: full`.
- func: nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor
  python_module: nn

# Forward with two keyword-only outputs (`output`, `total_weight`).
- func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: nll_loss2d_forward_out_cpu
    CUDA: legacy::cuda::_thnn_nll_loss2d_forward_out

- func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)
  python_module: nn
  dispatch:
    CPU: nll_loss2d_forward_cpu
    CUDA: legacy::cuda::_thnn_nll_loss2d_forward

# Backward entries: `total_weight` is the last positional input.
- func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: nll_loss2d_backward_out_cpu
    CUDA: legacy::cuda::_thnn_nll_loss2d_backward_out

- func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor
  python_module: nn
  dispatch:
    CPU: nll_loss2d_backward_cpu
    CUDA: legacy::cuda::_thnn_nll_loss2d_backward

# smooth_l1_loss family (out= forward, functional forward, `grad_input`
# backward, functional backward), all with `python_module: nn`.
# Here the out= entries carry the dispatch tables (same kernel name for CPU
# and CUDA), while the two functional entries declare none.
- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: smooth_l1_loss_out
    CUDA: smooth_l1_loss_out

- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward writing into `grad_input`; `reduction` has no default.
- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: smooth_l1_loss_backward_out
    CUDA: smooth_l1_loss_backward_out

- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# soft_margin_loss family: four entries (out=/functional forward and
# backward), all with `python_module: nn`. None of them declares a
# `dispatch:` table in this file.
- func: soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward entries: `reduction` is required (no default).
- func: soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn

- func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# elu family: out=, functional and in-place (`elu_`) forward entries plus two
# backward entries. Forward takes three Scalars (`alpha`, `scale`,
# `input_scale`), each defaulting to 1. Only the `grad_input` backward entry
# declares a dispatch table.
- func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward entries take the forward `output` tensor (not `self`) as their last
# positional input, and the three Scalars are required (no defaults).
- func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: elu_backward_out
    CUDA: elu_backward_out

- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# In-place variant: `self` is annotated as mutated and the return aliases it.
- func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)
  python_module: nn

# glu family: forward takes `dim` (default -1); backward takes `dim` without a
# default. CPU names native `glu*` kernels; CUDA names `_thnn_glu_*` bindings
# in the `legacy::cuda` namespace.
- func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: glu_out
    CUDA: legacy::cuda::_thnn_glu_forward_out

- func: glu(Tensor self, int dim=-1) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: glu
    CUDA: legacy::cuda::_thnn_glu_forward

# Backward writing into the keyword-only `grad_input`.
- func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: glu_backward_out
    CUDA: legacy::cuda::_thnn_glu_backward_out

- func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: glu_backward
    CUDA: legacy::cuda::_thnn_glu_backward

# hardsigmoid family: out=, functional, in-place and functional backward.
# There is no `grad_input` (out=) backward entry in this family.
# The out= and in-place entries declare no dispatch table.
- func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

# Functional variant: the only forward entry with backend kernels, including a
# separate QuantizedCPU kernel.
- func: hardsigmoid(Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: hardsigmoid
    CUDA: hardsigmoid
    QuantizedCPU: quantized_hardsigmoid

# In-place variant: `self` is mutated and returned.
- func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!)
  python_module: nn

- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: hardsigmoid_backward
    CUDA: hardsigmoid_backward

# hardtanh family: forward entries take `min_val` (default -1) and `max_val`
# (default 1). All three forward variants list CPU, CUDA and QuantizedCPU
# kernels; only the in-place variant additionally lists a Vulkan kernel.
- func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: hardtanh_out
    CUDA: hardtanh_out
    QuantizedCPU: quantized_hardtanh_out

- func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: hardtanh
    CUDA: hardtanh
    QuantizedCPU: quantized_hardtanh

# Backward entries: `min_val` / `max_val` are required (no defaults).
- func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: hardtanh_backward_out
    CUDA: hardtanh_backward_out

# Functional backward; no dispatch table is declared for it.
- func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# In-place variant: `self` is mutated and returned.
- func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: hardtanh_
    CUDA: hardtanh_
    QuantizedCPU: quantized_hardtanh_
    Vulkan: vulkan_hardtanh_

# hardswish family: out=, functional, in-place and functional backward, all
# with `python_module: nn`. None of the three forward entries declares a
# dispatch table; only the backward entry does. There is no `grad_input`
# backward entry.
- func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: hardswish(Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# In-place variant: `self` is mutated and returned.
- func: hardswish_(Tensor(a!) self) -> Tensor(a!)
  python_module: nn

- func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: hardswish_backward
    CUDA: hardswish_backward

# leaky_relu family: forward entries take `negative_slope` (default 0.01) and
# list CPU, CUDA and QuantizedCPU kernels.
- func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: leaky_relu_out
    CUDA: leaky_relu_out
    QuantizedCPU: quantized_leaky_relu_out

- func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: leaky_relu
    CUDA: leaky_relu
    QuantizedCPU: quantized_leaky_relu

# Backward: takes an extra required flag `self_is_result`
# (NOTE(review): presumably tells the kernel that `self` holds the forward
# result, e.g. after the in-place variant - confirm against the kernel).
# No dispatch table and no `grad_input` overload are declared.
- func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# In-place variant: `self` is mutated and returned.
- func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: leaky_relu_
    CUDA: leaky_relu_
    QuantizedCPU: quantized_leaky_relu_

# log_sigmoid family, in three layers:
#   * `log_sigmoid` / `.out`   - single-Tensor result, no dispatch table here;
#   * `log_sigmoid_forward`    - returns `(output, buffer)` and carries the
#     backend kernels;
#   * `log_sigmoid_backward`   - takes `buffer` as an input.
# CPU names native kernels; CUDA names `legacy::cuda::_thnn_*` bindings.
- func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: log_sigmoid(Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Forward with two keyword-only outputs (`output`, `buffer`).
- func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: log_sigmoid_forward_out_cpu
    CUDA: legacy::cuda::_thnn_log_sigmoid_forward_out

- func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: log_sigmoid_forward_cpu
    CUDA: legacy::cuda::_thnn_log_sigmoid_forward

# Backward entries: `buffer` is the last positional input.
- func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: log_sigmoid_backward_out_cpu
    CUDA: legacy::cuda::_thnn_log_sigmoid_backward_out

- func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: log_sigmoid_backward_cpu
    CUDA: legacy::cuda::_thnn_log_sigmoid_backward

# rrelu_with_noise family. Forward entries take a `noise` tensor, the bounds
# `lower` (default 0.125) and `upper` (default 0.3333333333333333), a
# `training` flag (default False) and an optional `generator` (default None).
# The three forward entries (which take `Generator?`) do not set
# `use_c10_dispatcher: full`. CPU names native kernels; CUDA names
# `legacy::cuda::_thnn_*` bindings.
# NOTE(review): `noise` is not annotated as mutated in any of these schemas;
# whether the kernels write to it cannot be told from this file.
- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: rrelu_with_noise_out_cpu
    CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_out

- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
  python_module: nn
  dispatch:
    CPU: rrelu_with_noise_cpu
    CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward

# Backward: all arguments are required, no generator is taken, and an extra
# `self_is_result` flag is passed. No dispatch table is declared.
- func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# In-place variant: `self` is mutated and returned.
- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: rrelu_with_noise_cpu_
    CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_

# softplus family: forward entries take `beta` (default 1) and `threshold`
# (default 20) and declare no dispatch table. Only the `grad_input` backward
# entry lists kernels (same name for CPU and CUDA).
- func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward entries take both `self` and the forward `output`; `beta` and
# `threshold` are required here.
- func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: softplus_backward_out
    CUDA: softplus_backward_out

- func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# softshrink family: forward entries take `lambd` (default 0.5) and declare no
# dispatch table. Only the `grad_input` backward entry lists kernels (same
# name for CPU and CUDA).
- func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# Backward entries: `lambd` is required (no default).
- func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: softshrink_backward_out
    CUDA: softshrink_backward_out

- func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# adaptive_avg_pool2d family. `output_size` is a fixed-length `int[2]` list.
# The out= entry lists CPU, CUDA and MkldnnCPU kernels.
- func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: adaptive_avg_pool2d_out_cpu
    CUDA: adaptive_avg_pool2d_out_cuda
    MkldnnCPU: mkldnn_adaptive_avg_pool2d_out

# Functional entry point; it declares no dispatch table of its own.
# NOTE(review): presumably it forwards to one of the two backend-specific
# entries below - confirm in the C++ implementation.
- func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# MKL-DNN-only entry: the single kernel listed is for MkldnnCPU, and the entry
# sets `requires_tensor: True`. It does not declare `python_module: nn`.
- func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    MkldnnCPU: mkldnn_adaptive_avg_pool2d
  requires_tensor: True

# Underscore-prefixed entry carrying the CPU, CUDA and QuantizedCPU kernels.
# It does not declare `python_module: nn`.
- func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
  use_c10_dispatcher: full
  dispatch:
    CPU: adaptive_avg_pool2d_cpu
    CUDA: adaptive_avg_pool2d_cuda
    QuantizedCPU: quantized_adaptive_avg_pool2d

# Backward for the underscore-prefixed entry. Unlike `_adaptive_avg_pool2d`
# itself, it does declare `python_module: nn`. There is no `grad_input`
# overload.
- func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_avg_pool2d_backward_cpu
    CUDA: adaptive_avg_pool2d_backward_cuda

# adaptive_avg_pool3d family: `output_size` is an `int[3]` list. Unlike the 2d
# family, the functional entry lists its CPU/CUDA kernels directly and there
# are no separate underscore-prefixed, MKL-DNN or quantized entries.
- func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: adaptive_avg_pool3d_out_cpu
    CUDA: adaptive_avg_pool3d_out_cuda

- func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_avg_pool3d_cpu
    CUDA: adaptive_avg_pool3d_cuda

# Backward entries take only `grad_output` and `self` (no `output_size`).
- func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: adaptive_avg_pool3d_backward_out_cpu
    CUDA: adaptive_avg_pool3d_backward_out_cuda

- func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_avg_pool3d_backward_cpu
    CUDA: adaptive_avg_pool3d_backward_cuda

# adaptive_max_pool2d family: forward returns two tensors; the out= overload
# names its keyword-only outputs `out` and `indices`. Backward takes `indices`
# as an input. CPU and CUDA kernels are listed for every entry.
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool2d_out_cpu
    CUDA: adaptive_max_pool2d_out_cuda

# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool2d_cpu
    CUDA: adaptive_max_pool2d_cuda

# Backward writing into the keyword-only `grad_input`.
- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool2d_backward_out_cpu
    CUDA: adaptive_max_pool2d_backward_out_cuda

- func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool2d_backward_cpu
    CUDA: adaptive_max_pool2d_backward_cuda

# adaptive_max_pool3d family: same layout as adaptive_max_pool2d, with
# `output_size` declared as `int[3]`.
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool3d_out_cpu
    CUDA: adaptive_max_pool3d_out_cuda

# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool3d_cpu
    CUDA: adaptive_max_pool3d_cuda

# Backward entries take the forward `indices` as an input.
- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool3d_backward_out_cpu
    CUDA: adaptive_max_pool3d_backward_out_cuda

- func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: adaptive_max_pool3d_backward_cpu
    CUDA: adaptive_max_pool3d_backward_cuda

# avg_pool2d family. Forward defaults: `stride=[]`, `padding=0`,
# `ceil_mode=False`, `count_include_pad=True`, `divisor_override=None`
# (`int?`). `kernel_size` is required. The out= entry lists CPU, CUDA and
# MkldnnCPU kernels; the functional entry additionally lists QuantizedCPU.
- func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: avg_pool2d_out_cpu
    CUDA: avg_pool2d_out_cuda
    MkldnnCPU: mkldnn_avg_pool2d_out

- func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: avg_pool2d_cpu
    CUDA: avg_pool2d_cuda
    MkldnnCPU: mkldnn_avg_pool2d
    QuantizedCPU: quantized_avg_pool2d

# Backward entries: every pooling parameter is required (no defaults), and
# only CPU and CUDA kernels are listed.
- func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: avg_pool2d_backward_out_cpu
    CUDA: avg_pool2d_backward_out_cuda

- func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: avg_pool2d_backward_cpu
    CUDA: avg_pool2d_backward_cuda

# avg_pool3d family: same parameters and defaults as avg_pool2d, with `int[3]`
# lists. No MkldnnCPU kernels are listed here; the functional entry lists a
# QuantizedCPU kernel in addition to CPU and CUDA.
- func: avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: avg_pool3d_out_cpu
    CUDA: avg_pool3d_out_cuda

- func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: avg_pool3d_cpu
    CUDA: avg_pool3d_cuda
    QuantizedCPU: quantized_avg_pool3d

# Backward entries: every pooling parameter is required (no defaults).
- func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: avg_pool3d_backward_out_cpu
    CUDA: avg_pool3d_backward_out_cuda

- func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: avg_pool3d_backward_cpu
    CUDA: avg_pool3d_backward_cuda

# fractional_max_pool2d family: forward takes `kernel_size`, `output_size` and
# a caller-supplied `random_samples` tensor, and returns two tensors. The out
# overload is named `.output` (keyword-only `output`, `indices`). Backward
# takes `indices` but not `random_samples`.
# Return: (Tensor output, Tensor indices)
- func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: fractional_max_pool2d_out_cpu
    CUDA: fractional_max_pool2d_out_cuda

# Return: (Tensor output, Tensor indices)
- func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: fractional_max_pool2d_cpu
    CUDA: fractional_max_pool2d_cuda

# Backward writing into the keyword-only `grad_input`.
- func: fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: fractional_max_pool2d_backward_out_cpu
    CUDA: fractional_max_pool2d_backward_out_cuda

- func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: fractional_max_pool2d_backward_cpu
    CUDA: fractional_max_pool2d_backward_cuda

# fractional_max_pool3d family: same layout as fractional_max_pool2d, with
# `kernel_size` and `output_size` declared as `int[3]`.
# Return: (Tensor output, Tensor indices)
- func: fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: fractional_max_pool3d_out_cpu
    CUDA: fractional_max_pool3d_out_cuda

# Return: (Tensor output, Tensor indices)
- func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: fractional_max_pool3d_cpu
    CUDA: fractional_max_pool3d_cuda

# Backward entries take `indices` but not `random_samples`.
- func: fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: fractional_max_pool3d_backward_out_cpu
    CUDA: fractional_max_pool3d_backward_out_cuda

- func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: fractional_max_pool3d_backward_cpu
    CUDA: fractional_max_pool3d_backward_cuda

# max_pool2d_with_indices family. Forward defaults: `stride=[]`, `padding=0`,
# `dilation=1`, `ceil_mode=False`; `kernel_size` is required. Forward returns
# two tensors; backward takes `indices` as its last positional input and has
# no defaults.
# Return: (Tensor output, Tensor indices)
- func: max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: max_pool2d_with_indices_out_cpu
    CUDA: max_pool2d_with_indices_out_cuda

# Return: (Tensor output, Tensor indices)
- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_pool2d_with_indices_cpu
    CUDA: max_pool2d_with_indices_cuda

# Backward writing into the keyword-only `grad_input`.
- func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: max_pool2d_with_indices_backward_out_cpu
    CUDA: max_pool2d_with_indices_backward_out_cuda

- func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_pool2d_with_indices_backward_cpu
    CUDA: max_pool2d_with_indices_backward_cuda

# max_pool3d_with_indices family: same parameters and defaults as the 2d
# family, with `int[3]` lists.
# Return: (Tensor output, Tensor indices)
- func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CPU: max_pool3d_with_indices_out_cpu
    CUDA: max_pool3d_with_indices_out_cuda

# Return: (Tensor output, Tensor indices)
- func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_pool3d_with_indices_cpu
    CUDA: max_pool3d_with_indices_cuda

# Backward entries: all pooling parameters are required; `indices` is the last
# positional input.
- func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: max_pool3d_with_indices_backward_out_cpu
    CUDA: max_pool3d_with_indices_backward_out_cuda

- func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_pool3d_with_indices_backward_cpu
    CUDA: max_pool3d_with_indices_backward_cuda

# max_unpool2d family: takes `self`, `indices` and an `int[2]` `output_size`,
# all required. Note the naming mismatch between operator and kernels: the
# operators are `max_unpool2d*`, the kernels `max_unpooling2d_*`.
- func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: max_unpooling2d_forward_out_cpu
    CUDA: max_unpooling2d_forward_out_cuda

- func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_unpooling2d_forward_cpu
    CUDA: max_unpooling2d_forward_cuda

# Backward entries repeat the forward arguments after `grad_output`.
- func: max_unpool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: max_unpooling2d_backward_out_cpu
    CUDA: max_unpooling2d_backward_out_cuda

- func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_unpooling2d_backward_cpu
    CUDA: max_unpooling2d_backward_cuda

# max_unpool3d family: unlike max_unpool2d, the 3d entries also require
# `stride` and `padding` (both `int[3]`) in addition to `output_size`.
# Kernels are named `max_unpooling3d_*`.
- func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: max_unpooling3d_forward_out_cpu
    CUDA: max_unpooling3d_forward_out_cuda

- func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_unpooling3d_forward_cpu
    CUDA: max_unpooling3d_forward_cuda

# Backward entries repeat the forward arguments after `grad_output`.
- func: max_unpool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: max_unpooling3d_backward_out_cpu
    CUDA: max_unpooling3d_backward_out_cuda

- func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: max_unpooling3d_backward_cpu
    CUDA: max_unpooling3d_backward_cuda

# reflection_pad1d family: `padding` is an `int[2]` list with no default.
- func: reflection_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: reflection_pad1d_out_cpu
    CUDA: reflection_pad1d_out_cuda

# Functional entry: the QuantizedCPU key deliberately or otherwise points at
# the same kernel as CPU (`reflection_pad1d_cpu`); no quantized-specific
# kernel is named. This is the only reflection/replication pad entry in view
# with a QuantizedCPU key.
- func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: reflection_pad1d_cpu
    CUDA: reflection_pad1d_cuda
    QuantizedCPU: reflection_pad1d_cpu

# Backward entries repeat `self` and `padding` after `grad_output`.
- func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: reflection_pad1d_backward_out_cpu
    CUDA: reflection_pad1d_backward_out_cuda

- func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: reflection_pad1d_backward_cpu
    CUDA: reflection_pad1d_backward_cuda

# reflection_pad2d family: `padding` is an `int[4]` list with no default.
# Only CPU and CUDA kernels are listed (no QuantizedCPU key, unlike
# reflection_pad1d).
- func: reflection_pad2d.out(Tensor self, int[4] padding, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: reflection_pad2d_out_cpu
    CUDA: reflection_pad2d_out_cuda

- func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: reflection_pad2d_cpu
    CUDA: reflection_pad2d_cuda

# Backward entries repeat `self` and `padding` after `grad_output`.
- func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: reflection_pad2d_backward_out_cpu
    CUDA: reflection_pad2d_backward_out_cuda

- func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: reflection_pad2d_backward_cpu
    CUDA: reflection_pad2d_backward_cuda

# replication_pad1d family: `padding` is an `int[2]` list with no default.
# CPU and CUDA kernels are listed for all four entries.
- func: replication_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: replication_pad1d_out_cpu
    CUDA: replication_pad1d_out_cuda

- func: replication_pad1d(Tensor self, int[2] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: replication_pad1d_cpu
    CUDA: replication_pad1d_cuda

# Backward entries repeat `self` and `padding` after `grad_output`.
- func: replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: replication_pad1d_backward_out_cpu
    CUDA: replication_pad1d_backward_out_cuda

- func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: replication_pad1d_backward_cpu
    CUDA: replication_pad1d_backward_cuda

# replication_pad2d family: same four-schema layout as replication_pad1d,
# with `padding` declared as int[4]. CPU and CUDA kernels for every entry.
- func: replication_pad2d.out(Tensor self, int[4] padding, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: replication_pad2d_out_cpu
    CUDA: replication_pad2d_out_cuda

- func: replication_pad2d(Tensor self, int[4] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: replication_pad2d_cpu
    CUDA: replication_pad2d_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: replication_pad2d_backward_out_cpu
    CUDA: replication_pad2d_backward_out_cuda

# Backward, returning a new Tensor.
- func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: replication_pad2d_backward_cpu
    CUDA: replication_pad2d_backward_cuda

# replication_pad3d family: same four-schema layout as the 1d/2d variants,
# with `padding` declared as int[6]. CPU and CUDA kernels for every entry.
- func: replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: replication_pad3d_out_cpu
    CUDA: replication_pad3d_out_cuda

- func: replication_pad3d(Tensor self, int[6] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: replication_pad3d_cpu
    CUDA: replication_pad3d_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, int[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: replication_pad3d_backward_out_cpu
    CUDA: replication_pad3d_backward_out_cuda

# Backward, returning a new Tensor.
- func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: replication_pad3d_backward_cpu
    CUDA: replication_pad3d_backward_cuda

# upsample_linear1d family. Forward schemas take `output_size` (int[1]), a
# required `align_corners` flag and an optional `scales` float defaulting to
# None. The backward schemas do not take the forward input tensor; they take
# `grad_output` plus `output_size` and `input_size` (int[3]) instead.
- func: upsample_linear1d.out(Tensor self, int[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_linear1d_out_cpu
    CUDA: upsample_linear1d_out_cuda

- func: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_linear1d_cpu
    CUDA: upsample_linear1d_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_linear1d_backward_out_cpu
    CUDA: upsample_linear1d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_linear1d_backward_cpu
    CUDA: upsample_linear1d_backward_cuda

# upsample_bilinear2d family. Forward schemas take `output_size` (int[2]), a
# required `align_corners` flag and optional `scales_h` / `scales_w` floats
# defaulting to None. Backward schemas take `grad_output`, `output_size` and
# `input_size` (int[4]) rather than the forward input tensor.
- func: upsample_bilinear2d.out(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_bilinear2d_out_cpu
    CUDA: upsample_bilinear2d_out_cuda

- func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_bilinear2d_cpu
    CUDA: upsample_bilinear2d_cuda
    # The functional forward is the only entry in this family with a
    # QuantizedCPU kernel.
    QuantizedCPU: quantized_upsample_bilinear2d_cpu

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_bilinear2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_bilinear2d_backward_out_cpu
    CUDA: upsample_bilinear2d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_bilinear2d_backward_cpu
    CUDA: upsample_bilinear2d_backward_cuda

# upsample_bicubic2d family. Signatures mirror upsample_bilinear2d
# (`output_size` int[2], required `align_corners`, optional `scales_h` /
# `scales_w`; backward adds `input_size` int[4]). Only CPU and CUDA kernels
# are registered; no QuantizedCPU entry is listed here.
- func: upsample_bicubic2d.out(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_bicubic2d_out_cpu
    CUDA: upsample_bicubic2d_out_cuda

- func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_bicubic2d_cpu
    CUDA: upsample_bicubic2d_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_bicubic2d_backward_out_cpu
    CUDA: upsample_bicubic2d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_bicubic2d_backward_cpu
    CUDA: upsample_bicubic2d_backward_cuda

# upsample_trilinear3d family. Forward schemas take `output_size` (int[3]), a
# required `align_corners` flag and optional `scales_d` / `scales_h` /
# `scales_w` floats defaulting to None. Backward schemas take `grad_output`,
# `output_size` and `input_size` (int[5]) rather than the forward input.
- func: upsample_trilinear3d.out(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_trilinear3d_out_cpu
    CUDA: upsample_trilinear3d_out_cuda

- func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_trilinear3d_cpu
    CUDA: upsample_trilinear3d_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_trilinear3d_backward_out_cpu
    CUDA: upsample_trilinear3d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_trilinear3d_backward_cpu
    CUDA: upsample_trilinear3d_backward_cuda

# upsample_nearest1d family. Unlike the linear/bilinear/bicubic/trilinear
# schemas, the nearest variants have no `align_corners` parameter: the forward
# takes only `output_size` (int[1]) and an optional `scales` float. Backward
# schemas take `grad_output`, `output_size` and `input_size` (int[3]).
- func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_nearest1d_out_cpu
    CUDA: upsample_nearest1d_out_cuda

- func: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_nearest1d_cpu
    CUDA: upsample_nearest1d_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_nearest1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_nearest1d_backward_out_cpu
    CUDA: upsample_nearest1d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_nearest1d_backward_cpu
    CUDA: upsample_nearest1d_backward_cuda

# upsample_nearest2d family. Forward schemas take `output_size` (int[2]) and
# optional `scales_h` / `scales_w` floats; there is no `align_corners`
# parameter. Backward schemas take `grad_output`, `output_size` and
# `input_size` (int[4]).
- func: upsample_nearest2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_nearest2d_out_cpu
    CUDA: upsample_nearest2d_out_cuda

- func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_nearest2d_cpu
    CUDA: upsample_nearest2d_cuda
    # The functional forward additionally registers QuantizedCPU and Vulkan
    # kernels; the out and backward entries in this family do not.
    QuantizedCPU: quantized_upsample_nearest2d_cpu
    Vulkan: upsample_nearest2d_vulkan

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_nearest2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_nearest2d_backward_out_cpu
    CUDA: upsample_nearest2d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_nearest2d_backward_cpu
    CUDA: upsample_nearest2d_backward_cuda

# upsample_nearest3d family. Forward schemas take `output_size` (int[3]) and
# optional `scales_d` / `scales_h` / `scales_w` floats; there is no
# `align_corners` parameter. Backward schemas take `grad_output`,
# `output_size` and `input_size` (int[5]).
- func: upsample_nearest3d.out(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_nearest3d_out_cpu
    CUDA: upsample_nearest3d_out_cuda

- func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_nearest3d_cpu
    CUDA: upsample_nearest3d_cuda
    # The functional forward is the only entry in this family with a
    # QuantizedCPU kernel.
    QuantizedCPU: quantized_upsample_nearest3d_cpu

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: upsample_nearest3d_backward_out_cpu
    CUDA: upsample_nearest3d_backward_out_cuda

# Backward, returning a new Tensor.
- func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: upsample_nearest3d_backward_cpu
    CUDA: upsample_nearest3d_backward_cuda

# sigmoid_backward pair. Both schemas take the forward *output* (not the
# forward input) alongside `grad_output`. The `.grad_input` overload maps CPU
# and CUDA to the same kernel name, sigmoid_backward_out.
- func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: sigmoid_backward_out
    CUDA: sigmoid_backward_out

# Functional form: no `dispatch` section is given for this entry.
# NOTE(review): presumably a single backend-agnostic implementation is used;
# confirm against the README referenced at the top of this file.
- func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# tanh_backward pair. Both schemas take the forward *output* (not the forward
# input) alongside `grad_output`. The `.grad_input` overload maps CPU and CUDA
# to the same kernel name, tanh_backward_out.
- func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: tanh_backward_out
    CUDA: tanh_backward_out

# Functional form: no `dispatch` section is given for this entry.
# NOTE(review): presumably a single backend-agnostic implementation is used;
# confirm against the README referenced at the top of this file.
- func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor
  use_c10_dispatcher: full
  python_module: nn

# What's a thnn_conv_ versus a slow_conv_?
#
# Historically, we have inefficient implementations of convolutions
# coming from the THNN/THCUNN library.  These convolutions typically
# operated by computing the Toeplitz matrix and then doing a matrix
# multiply with the input; this is very memory inefficient!  However,
# occasionally, we really don't have anything better, so it's helpful
# to have these fallbacks when there is no more optimized implementation
# in cudnn or mkldnn, etc.  Both thnn_ and slow_ convolutions fall
# into this bucket.
#
# The difference between these two designations is that thnn_ refers
# to a convolution that is still written in the "legacy" style; that is,
# C code in the THNN/ or THCUNN/ directory.  A slow_ convolution is
# one that is written in the native style: modern C++.  Algorithmically,
# these are the same thing, but we give them different prefixes to
# make the operational distinction clear.

# slow_conv_transpose2d family. The forward schemas take an optional `bias`
# (Tensor?, default None) and int[2] stride/padding/output_padding/dilation
# with defaults 1/0/0/1. The backward schemas take the same int[2] lists
# without defaults, plus two extra tensors named `columns` and `ones`.
- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose2d_out_cpu
    CUDA: slow_conv_transpose2d_out_cuda

# Note: this functional entry does not set `use_c10_dispatcher: full`, unlike
# the functional `.output_mask` backward below.
- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose2d_cpu
    CUDA: slow_conv_transpose2d_cuda

# Backward with three optional caller-supplied outputs (`Tensor(a!)?` etc.)
# for grad_input, grad_weight and grad_bias.
- func: slow_conv_transpose2d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose2d_backward_out_cpu
    CUDA: slow_conv_transpose2d_backward_out_cuda

# Backward returning a (grad_input, grad_weight, grad_bias) tuple; takes a
# bool[3] `output_mask` argument.
# NOTE(review): presumably the mask selects which of the three gradients are
# computed; confirm in the kernel.
- func: slow_conv_transpose2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose2d_backward_cpu
    CUDA: slow_conv_transpose2d_backward_cuda

# slow_conv_transpose3d family. Same layout as slow_conv_transpose2d with
# int[3] lists. The two extra tensors on the backward schemas are named
# `finput` and `fgrad_input` here (the 2d variant calls them `columns` and
# `ones`).
- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose3d_out_cpu
    CUDA: slow_conv_transpose3d_out_cuda

# Note: this functional entry does not set `use_c10_dispatcher: full`, unlike
# the functional `.output_mask` backward below.
- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose3d_cpu
    CUDA: slow_conv_transpose3d_cuda

# Backward with three optional caller-supplied outputs (`Tensor(a!)?` etc.)
# for grad_input, grad_weight and grad_bias.
- func: slow_conv_transpose3d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose3d_backward_out_cpu
    CUDA: slow_conv_transpose3d_backward_out_cuda

# Backward returning a (grad_input, grad_weight, grad_bias) tuple; takes a
# bool[3] `output_mask` argument.
- func: slow_conv_transpose3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: slow_conv_transpose3d_backward_cpu
    CUDA: slow_conv_transpose3d_backward_cuda

# thnn_conv2d family: six schemas. The two user-facing entries (`thnn_conv2d`
# and `thnn_conv2d.out`) have no `dispatch` section; the `_forward` and
# `_backward` entries carry the per-backend kernels.
# Despite the thnn_ prefix on the operator names, every CPU kernel listed here
# is named slow_conv2d_*. On CUDA the forward entries point at
# legacy::cuda::_thnn_conv2d_forward*, while the backward entries point at
# slow_conv2d_backward*_cuda.
- func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor
  python_module: nn

# Forward with three caller-supplied outputs: `output`, `finput` and
# `fgrad_input`. Here `bias`, `stride` and `padding` have no defaults.
- func: thnn_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  python_module: nn
  dispatch:
    CPU: slow_conv2d_forward_out_cpu
    CUDA: legacy::cuda::_thnn_conv2d_forward_out

# Forward returning an (output, finput, fgrad_input) tuple.
- func: thnn_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input)
  python_module: nn
  dispatch:
    CPU: slow_conv2d_forward_cpu
    CUDA: legacy::cuda::_thnn_conv2d_forward

# Backward taking `finput` / `fgrad_input` as inputs, with three optional
# caller-supplied gradient outputs.
- func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  python_module: nn
  dispatch:
    CPU: slow_conv2d_backward_out_cpu
    CUDA: slow_conv2d_backward_out_cuda

# Backward returning a (grad_input, grad_weight, grad_bias) tuple; takes a
# bool[3] `output_mask` argument.
- func: thnn_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: slow_conv2d_backward_cpu
    CUDA: slow_conv2d_backward_cuda

# thnn_conv_depthwise2d family: six schemas. The two user-facing entries have
# no `dispatch` section; the `_forward` and `_backward` entries register a
# CUDA kernel only (no CPU entry is listed anywhere in this family).
# The forward returns a single Tensor, and the backward produces only
# grad_input and grad_weight (bool[2] mask, no grad_bias).
- func: thnn_conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: thnn_conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
  python_module: nn

# Forward writing into `out`; `bias`, `stride`, `padding` and `dilation` have
# no defaults here.
- func: thnn_conv_depthwise2d_forward.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward_out

- func: thnn_conv_depthwise2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
  python_module: nn
  dispatch:
    CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward

# Backward with two optional caller-supplied outputs (grad_input,
# grad_weight).
- func: thnn_conv_depthwise2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight) -> (Tensor(a!), Tensor(b!))
  python_module: nn
  dispatch:
    CUDA: thnn_conv_depthwise2d_backward_out

# Backward returning a (grad_input, grad_weight) tuple; takes a bool[2]
# `output_mask` argument.
- func: thnn_conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CUDA: thnn_conv_depthwise2d_backward

# slow_conv3d family: six schemas with the same shape as the thnn_conv2d
# family, using int[3] lists. The two user-facing entries have no `dispatch`
# section; the `_forward` and `_backward` entries register a CPU kernel only
# (no CUDA entry is listed anywhere in this family).
- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn

- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor
  python_module: nn

# Forward with three caller-supplied outputs: `output`, `finput` and
# `fgrad_input`. Here `bias`, `stride` and `padding` have no defaults.
- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  python_module: nn
  dispatch:
    CPU: slow_conv3d_forward_out_cpu

# Forward returning an (output, finput, fgrad_input) tuple.
- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input)
  python_module: nn
  dispatch:
    CPU: slow_conv3d_forward_cpu

# Backward taking `finput` / `fgrad_input` as inputs, with three optional
# caller-supplied gradient outputs.
- func: slow_conv3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  python_module: nn
  dispatch:
    CPU: slow_conv3d_backward_out_cpu

# Backward returning a (grad_input, grad_weight, grad_bias) tuple; takes a
# bool[3] `output_mask` argument.
- func: slow_conv3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: slow_conv3d_backward_cpu

# slow_conv_dilated2d pair: a functional forward and a functional backward
# only; no out-variant overloads are declared in this section. The forward
# takes an optional `bias` and int[2] stride/padding/dilation with defaults
# 1/0/1; the backward takes the same lists without defaults plus a bool[3]
# `output_mask`, and returns (grad_input, grad_weight, grad_bias).
- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
  python_module: nn
  dispatch:
    CPU: slow_conv_dilated2d_cpu
    CUDA: slow_conv_dilated2d_cuda

- func: slow_conv_dilated2d_backward(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: slow_conv_dilated2d_backward_cpu
    CUDA: slow_conv_dilated2d_backward_cuda

# slow_conv_dilated3d pair: same layout as slow_conv_dilated2d with int[3]
# lists. Functional forward and functional backward only; both register CPU
# and CUDA kernels. The backward takes a bool[3] `output_mask` and returns
# (grad_input, grad_weight, grad_bias).
- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
  python_module: nn
  dispatch:
    CPU: slow_conv_dilated3d_cpu
    CUDA: slow_conv_dilated3d_cuda

- func: slow_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: slow_conv_dilated3d_backward_cpu
    CUDA: slow_conv_dilated3d_backward_cuda

# col2im family. The forward takes `output_size`, `kernel_size`, `dilation`,
# `padding` and `stride`, all int[2] and all without defaults. The backward
# schemas drop `output_size` and take only `grad_output` plus the four
# remaining int[2] lists. CPU and CUDA kernels for every entry.
- func: col2im.out(Tensor self, int[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: col2im_out_cpu
    CUDA: col2im_out_cuda

- func: col2im(Tensor self, int[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: col2im_cpu
    CUDA: col2im_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: col2im_backward.grad_input(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: col2im_backward_out_cpu
    CUDA: col2im_backward_out_cuda

# Backward, returning a new Tensor.
- func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: col2im_backward_cpu
    CUDA: col2im_backward_cuda

# im2col family. The forward takes `kernel_size`, `dilation`, `padding` and
# `stride`, all int[2] and all without defaults. The backward schemas add an
# int[2] `input_size` argument after `grad_output` (the mirror image of
# col2im, whose forward takes `output_size`). CPU and CUDA kernels for every
# entry.
- func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: im2col_out_cpu
    CUDA: im2col_out_cuda

- func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: im2col_cpu
    CUDA: im2col_cuda

# Backward, writing into the caller-supplied `grad_input` tensor.
- func: im2col_backward.grad_input(Tensor grad_output, int[2] input_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) grad_input) -> Tensor(a!)
  python_module: nn
  dispatch:
    CPU: im2col_backward_out_cpu
    CUDA: im2col_backward_out_cuda

# Backward, returning a new Tensor.
- func: im2col_backward(Tensor grad_output, int[2] input_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
  use_c10_dispatcher: full
  python_module: nn
  dispatch:
    CPU: im2col_backward_cpu
    CUDA: im2col_backward_cuda

# isfinite: takes one Tensor and returns a Tensor. Exposed as both a free
# function and a Tensor method, with no `python_module` tag and no `dispatch`
# section. `device_guard: False` is set explicitly.
# NOTE(review): presumably the implementation is composed of other ops and so
# needs no per-backend kernel or device guard; confirm in the C++ definition.
- func: isfinite(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# isinf: takes one Tensor and returns a Tensor. Exposed as both a free
# function and a Tensor method, with no `python_module` tag and no `dispatch`
# section. `device_guard: False` is set explicitly.
# NOTE(review): presumably the implementation is composed of other ops and so
# needs no per-backend kernel or device guard; confirm in the C++ definition.
- func: isinf(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  device_guard: False

# Note: this function is only for testing.
# It is undocumented and should not be used outside of tests.
- func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
  use_c10_dispatcher: full