codegen/native_functions.yaml in torch-rb-0.13.2 vs codegen/native_functions.yaml in torch-rb-0.14.0
- old (line as it appears in torch-rb-0.13.2; removed or changed in 0.14.0)
+ new (line added or changed in torch-rb-0.14.0)
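native_functions.yaml is the copy of PyTorch's ATen operator declarations that torch-rb ships under codegen/ for its code generation; each entry declares an operator schema plus metadata. As orientation for the hunks below, here is a hedged, editor-annotated sketch (not part of the upstream file), modeled on the abs declaration that appears later in this diff, showing how one entry surfaces through the Python frontend of the same LibTorch operators:

    import torch
    # A typical entry in this file looks roughly like:
    #   - func: abs(Tensor self) -> Tensor               # JIT schema: name, args, return type
    #     variants: function, method                     # generate torch.abs(x) and x.abs()
    #     dispatch:                                       # dispatch key -> C++ kernel to call
    #       CompositeExplicitAutograd: abs
    #       SparseCPU, SparseCUDA: abs_sparse
    #       SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
    #     tags: [core, pointwise]                         # tags consumed by downstream tooling
    x = torch.tensor([-1.0, 2.0])
    assert torch.equal(torch.abs(x), x.abs())             # both callables come from one entry

Most hunks below follow a few recurring patterns: int/int[] arguments widened to SymInt/SymInt[] for symbolic shapes, extra dispatch keys (notably MPS and NestedTensorCPU/NestedTensorCUDA) added to existing kernels, and tags: lists extended (core, nondeterministic_seeded).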
@@ -168,13 +168,41 @@
- func: _assert_async(Tensor self) -> ()
dispatch:
CPU: _assert_async_cpu
CUDA: _assert_async_cuda
+- func: _assert_async.msg(Tensor self, str assert_msg) -> ()
+ dispatch:
+ CPU: _assert_async_msg_cpu
+ CUDA: _assert_async_msg_cuda
-- func: _assert_tensor_metadata(Tensor a, int[]? size=None, int[]? stride=None, ScalarType? dtype=None) -> ()
+- func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
+ dispatch:
+ CPU: _functional_assert_async_msg_cpu
+- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
+
+- func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
+ dispatch:
+ CompositeExplicitAutograd: sym_constrain_range
+
+- func: sym_constrain_range_for_size(Scalar size, *, int? min, int? max) -> ()
+ dispatch:
+ CompositeExplicitAutograd: sym_constrain_range_for_size
+
+- func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _functional_sym_constrain_range
+
+- func: _functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _functional_sym_constrain_range_for_size
+
+- func: _make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+ dispatch:
+ CPU: _make_dep_token_cpu
+
- func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
variants: method
- func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
device_check: NoCheck # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss
@@ -209,20 +237,22 @@
# Disable dispatch time device check for consistent behavior.
device_check: NoCheck
dispatch:
CUDA: _cudnn_rnn
autogen: _cudnn_rnn.out
+ tags: nondeterministic_seeded
- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
dispatch:
CUDA: _cudnn_rnn_backward
autogen: _cudnn_rnn_backward.out
- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
dispatch:
CUDA: _cudnn_init_dropout_state
autogen: _cudnn_init_dropout_state.out
+ tags: nondeterministic_seeded
- func: _debug_has_internal_overlap(Tensor self) -> int
variants: function
- func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)
@@ -295,19 +325,21 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
- func: abs_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: abs_out
@@ -372,26 +404,28 @@
CPU, CUDA, MPS, Meta: view_as_real
- func: view_as_complex(Tensor(a) self) -> Tensor(a)
variants: function
dispatch:
- CPU, CUDA, Meta: view_as_complex
+ CPU, CUDA, MPS, Meta: view_as_complex
- func: sgn(Tensor self) -> Tensor
variants: function, method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
- func: sgn_(Tensor(a!) self) -> Tensor(a!)
variants: method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
structured: True
structured_inherits: TensorIteratorBase
@@ -486,12 +520,14 @@
variants: function, method
- func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
- func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor
+ tags: core
- func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor
+ tags: core
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@@ -608,17 +644,17 @@
dispatch:
CPU, CUDA: addr_out
MPS: addr_out_mps
CompositeExplicitAutograd: math_addr_out
-- func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor
+- func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: affine_grid_generator
autogen: affine_grid_generator.out
-- func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor
+- func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor
variants: function
- func: _is_all_true(Tensor self) -> Tensor
variants: function, method
dispatch:
@@ -631,10 +667,17 @@
# Note: this function is only for testing.
- func: _test_check_tensor(Tensor self) -> Tensor
variants: function
+# Note: this function is only for testing
+- func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor
+ variants: function
+ dispatch:
+ CPU: _test_functorch_fallback
+ autogen: _test_functorch_fallback.out
+
- func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: all.out
variants: function, method
@@ -662,10 +705,11 @@
- func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: any.out
variants: function, method
+ tags: core
- func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
precomputed:
@@ -1106,18 +1150,19 @@
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: bitwise_not_out
+ MPS: bitwise_not_out_mps
tags: pointwise
- func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
- CPU, CUDA: copysign_out
+ CPU, CUDA, MPS: copysign_out
tags: pointwise
- func: copysign.Tensor(Tensor self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
@@ -1148,17 +1193,19 @@
- func: logical_not(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: logical_not
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
tags: [core, pointwise]
- func: logical_not_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CompositeExplicitAutograd: logical_not_
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
tags: pointwise
- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
@@ -1169,11 +1216,11 @@
- func: logical_xor(Tensor self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: logical_xor
- tags: pointwise
+ tags: [core, pointwise]
- func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
@@ -1324,11 +1371,11 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: ceil_sparse
SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr
- tags: pointwise
+ tags: [core, pointwise]
- func: ceil_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured_delegate: ceil.out
variants: function, method
@@ -1391,11 +1438,11 @@
tags: [core, pointwise]
- func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
variants: function, method
structured_delegate: clamp.Tensor_out
- tags: pointwise
+ tags: [core, pointwise]
- func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ['min']
@@ -1550,10 +1597,11 @@
CompositeExplicitAutograd: polar
- func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: polar_out
+ MPS: polar_out_mps
- func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: constant_pad_nd
@@ -1596,15 +1644,21 @@
- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor
- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
+- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv1d_symint
-- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
+- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv2d_symint
-- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor
+- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv3d_symint
- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor
cpp_no_default_args: ['bias', 'stride', 'padding']
- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding="valid", int[2] dilation=1, int groups=1) -> Tensor
@@ -1619,15 +1673,21 @@
autogen: conv_tbc.out
- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)
# NB: we inherit the goofy argument order from PyTorch torch.nn.functional
-- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor
+- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv_transpose1d_symint
-- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor
+- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv_transpose2d_symint
-- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
+- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv_transpose3d_symint
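The `# NB:` comment above these entries points out that the conv_transpose ops inherit torch.nn.functional's argument order, where groups comes before dilation (unlike the plain conv ops). The 0.14.0 change only widens padding/output_padding to SymInt and routes the composites through *_symint kernels, so the call surface should be unchanged. As a hedged, editor-supplied illustration against the Python frontend of the same operator (not from the upstream file), keyword arguments sidestep the unusual positional order:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 4, 8)   # (batch, in_channels, length)
    w = torch.randn(4, 2, 3)   # (in_channels, out_channels // groups, kernel)
    # Positional order mirrors the schema above: stride, padding, output_padding,
    # groups, dilation -- note that groups precedes dilation.
    y1 = F.conv_transpose1d(x, w, None, 2, 1, 1, 1, 1)
    # Keyword arguments make the same call without relying on that order.
    y2 = F.conv_transpose1d(x, w, bias=None, stride=2, padding=1,
                            output_padding=1, groups=1, dilation=1)
    assert torch.allclose(y1, y2)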
- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
variants: function
dispatch:
CompositeExplicitAutogradNonFunctional: copy
@@ -1848,10 +1908,11 @@
- func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: cumprod_out
+ MPS: cumprod_out_mps
- func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
@@ -1868,10 +1929,11 @@
- func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
structured_delegate: cumsum.out
device_check: NoCheck # TensorIterator
variants: function, method
+ tags: core
- func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
structured_delegate: cumsum.out
variants: method
@@ -2143,10 +2205,11 @@
- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
dispatch:
CompositeExplicitAutograd: embedding_symint
NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
autogen: embedding.out
+ tags: core
- func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
dispatch:
CompositeImplicitAutograd: embedding_backward_symint
@@ -2200,10 +2263,11 @@
- func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
dispatch:
CPU: _embedding_bag_cpu
CUDA: _embedding_bag_cuda
autogen: _embedding_bag.out
+ tags: core
- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
CompositeImplicitAutograd: _embedding_bag_backward_symint
@@ -2238,11 +2302,17 @@
Meta: empty_meta_symint
MkldnnCPU: empty_mkldnn
SparseCPU, SparseCUDA, SparseMeta: empty_sparse
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
+ tags: core
+- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: empty_permuted_symint
+ autogen: empty_permuted.out
+
# We do not make new_empty a composite that calls into new_empty_strided, as the strided version
# is significantly more difficult to implement by different backends
- func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
variants: method
dispatch:
@@ -2278,19 +2348,19 @@
# non-differentiable so NonFunctional doesn't apply
CompositeExplicitAutograd: new_ones
autogen: new_ones.out
# other overrides are to provide a more helpful error message that dtype is required
-- func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+- func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
dispatch:
CPU: empty_affine_quantized_other_backends_stub
QuantizedCPU, QuantizedCUDA: empty_affine_quantized
autogen: _empty_affine_quantized.out
# it's a factory function receiving a tensor argument, thus overriding explicitly
# other overrides are to provide a more helpful error message that dtype is required
-- func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+- func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
category_override: factory
dispatch:
CPU: empty_per_channel_affine_quantized_other_backends_stub
QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized
autogen: _empty_per_channel_affine_quantized.out
@@ -2311,11 +2381,11 @@
autogen: resize, resize.out
# This is a utility function to enable users to resize out tensor while registering kernels for out variants.
# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration
# to make it easy to register out variants for ops.
-- func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!)
+- func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)
use_const_ref_for_mutable_tensors: True
variants: function
dispatch:
Meta: _resize_output_
autogen: _resize_output, _resize_output.out
@@ -2481,25 +2551,25 @@
variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
device_check: NoCheck
device_guard: False
# decomposes to eye.m
-- func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CompositeExplicitAutograd: eye
-- func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CompositeExplicitAutograd: eye
-- func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!)
+- func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, Meta: eye_out_cpu
CUDA: eye_out_cuda
MPS: eye_out_mps
-- func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!)
+- func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, Meta: eye_out_cpu
CUDA: eye_out_cuda
MPS: eye_out_mps
@@ -2513,15 +2583,19 @@
variants: function, method
- func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
variants: function, method
-- func: unflatten.int(Tensor(a) self, int dim, int[] sizes) -> Tensor(a)
+- func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
variants: function, method
+ dispatch:
+ CompositeImplicitAutograd: unflatten_symint
-- func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a)
+- func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
variants: function, method
+ dispatch:
+ CompositeImplicitAutograd: unflatten_dimname_symint
- func: fill.Scalar(Tensor self, Scalar value) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: fill
@@ -2837,17 +2911,17 @@
dispatch:
CPU: _fft_r2c_mkl_out
CUDA: _fft_r2c_cufft_out
# Complex to real inverse FFT
-- func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor
+- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
variants: function
dispatch:
CPU: _fft_c2r_mkl
CUDA: _fft_c2r_cufft
-- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CPU: _fft_c2r_mkl_out
CUDA: _fft_c2r_cufft_out
@@ -2869,25 +2943,25 @@
variants: function
dispatch:
CPU: _validate_compressed_sparse_indices_cpu
CUDA: _validate_compressed_sparse_indices_cuda
-- func: _cufft_get_plan_cache_size(int device_index) -> int
+- func: _cufft_get_plan_cache_size(DeviceIndex device_index) -> int
-- func: _cufft_get_plan_cache_max_size(int device_index) -> int
+- func: _cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int
-- func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> ()
+- func: _cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> ()
-- func: _cufft_clear_plan_cache(int device_index) -> ()
+- func: _cufft_clear_plan_cache(DeviceIndex device_index) -> ()
- func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: index.Tensor_out
variants: function, method
dispatch:
QuantizedCPU: quantized_index
- tags: dynamic_output_shape
+ tags: [core, dynamic_output_shape]
# NB: This function is special-cased in tools/autograd/gen_variable_type.py
# NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
# - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
# - Tensor Tensor::index(std::initializer_list<TensorIndex> indices)
@@ -2898,10 +2972,17 @@
precomputed:
- indices -> DimVector sizes, DimVector strides
dispatch:
CPU, CUDA, MPS: index_out
+# Used by inductor to signal indexing without bounds checks
+# Note that we don't support boolean indexing, to avoid dynamic output shapes
+- func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+ variants: function
+ dispatch:
+ CPU, CUDA: _unsafe_index
+
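Per the comments above, _unsafe_index is used by inductor to signal indexing without bounds checks, and boolean indexing is not supported to avoid dynamic output shapes. A hedged, editor-supplied sketch of the calling convention (not part of the upstream file; assumes a LibTorch/PyTorch build new enough to ship this op, i.e. the 2.1-era sources vendored here):

    import torch

    x = torch.arange(6.0).reshape(2, 3)
    idx = torch.tensor([0, 2, 1])
    # Regular advanced indexing (aten::index) validates the index tensors.
    a = torch.ops.aten.index.Tensor(x, [None, idx])
    # _unsafe_index takes the same Tensor?[] indices but skips bounds checks
    # and rejects boolean masks; it is intended for compiler-generated code.
    b = torch.ops.aten._unsafe_index.Tensor(x, [None, idx])
    assert torch.equal(a, b)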
- func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
structured: True
variants: function
precomputed:
- dim -> int dim
@@ -2937,11 +3018,18 @@
- func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: index_put
+ tags: core
+- func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+ device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator
+ variants: function
+ dispatch:
+ CompositeExplicitAutograd: _unsafe_index_put
+
- func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
dispatch:
CPU, CUDA, MPS: _index_put_impl_
@@ -3095,10 +3183,11 @@
- func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
dispatch:
CPU: layer_norm_backward_cpu
CUDA: layer_norm_backward_cuda
MPS: layer_norm_backward_mps
+ NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
autogen: native_layer_norm_backward.out
tags: core
- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
variants: function, method
@@ -3158,10 +3247,22 @@
- func: mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
dispatch:
MkldnnCPU: mkldnn_linear_backward
autogen: mkldnn_linear_backward.out
+- func: _cslt_compress(Tensor input) -> Tensor
+ dispatch:
+ CUDA: _cslt_compress
+
+- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor
+ dispatch:
+ CUDA: _cslt_sparse_mm
+
+- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_linear
+
- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
- func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int)
@@ -3353,10 +3454,11 @@
structured: True
structured_inherits: TensorIteratorBase
variants: function
dispatch:
CPU, CUDA: xlogy_out
+ MPS: xlogy_out_mps
tags: pointwise
- func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
@@ -3508,10 +3610,11 @@
- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
device_check: NoCheck # TensorIterator
structured: True
dispatch:
CPU, CUDA: aminmax_out
+ MPS: aminmax_out_mps
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
dispatch:
CPU, CUDA: _compute_linear_combination
@@ -3605,19 +3708,25 @@
dispatch:
QuantizedCPU: quantized_max_pool2d
QuantizedCUDA: quantized_max_pool2d_cudnn
autogen: quantized_max_pool2d.out
+- func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+ dispatch:
+ QuantizedCPU: quantized_max_pool3d
+ autogen: quantized_max_pool3d.out
+
- func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
# The CPU and GPU dispatch variants are named weirdly here because otherwise there
# are namespacing issues in C++
- func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: mean
+ tags: core
# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
# FIXME: fix CI jobs and re-enable this
#- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
# device_check: NoCheck # TensorIterator
@@ -3754,10 +3863,11 @@
autogen: mkldnn_convolution.out
- func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)
dispatch:
CPU: mkldnn_rnn_layer
+ MkldnnCPU: mkldnn_rnn_layer
autogen: mkldnn_rnn_layer.out
- func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
dispatch:
CPU: mkldnn_rnn_layer_backward
@@ -3798,11 +3908,13 @@
- func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
dispatch:
CUDA: miopen_rnn
autogen: miopen_rnn.out
+ tags: nondeterministic_seeded
+
- func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
dispatch:
CUDA: miopen_rnn_backward
autogen: miopen_rnn_backward.out
@@ -3821,10 +3933,18 @@
CUDA: mm_out_cuda
MPS: mm_out_mps
SparseCPU, SparseCUDA: _sparse_mm_out
SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
+- func: _int_mm(Tensor self, Tensor mat2) -> Tensor
+ dispatch:
+ CUDA: _int_mm_cuda
+
+- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CUDA: _int_mm_out_cuda
+
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
python_module: sparse
@@ -3979,11 +4099,10 @@
dispatch:
CPU: batch_norm_cpu
CUDA: batch_norm_cuda
MPS: batch_norm_mps
MkldnnCPU: mkldnn_batch_norm
- tags: core
- func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
dispatch:
CUDA: batch_norm_cuda_out
MPS: batch_norm_mps_out
@@ -3995,11 +4114,21 @@
CPU: _batch_norm_legit_cpu
CUDA: _batch_norm_legit_cuda
MPS: _batch_norm_legit_mps
MkldnnCPU: _mkldnn_batch_norm_legit
autogen: _native_batch_norm_legit_functional
+ tags: core
+# HACK: identical to _native_batch_norm_legit, but training is known to be False,
+# so we know that running stats will not be mutated.
+# The real fix here is batch norm consolidation.
+- func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CompositeExplicitAutograd: _batch_norm_legit_no_training
+ autogen: _native_batch_norm_legit_no_training.out
+ tags: core
+
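The HACK comment above states that this op is identical to _native_batch_norm_legit with training fixed to False, which is why running_mean/running_var are declared here as plain (non-mutable) Tensor arguments. A hedged, editor-supplied sketch of the claimed equivalence (not part of the upstream file; calls the private aten ops via torch.ops on a matching PyTorch build):

    import torch

    x = torch.randn(4, 3, 8, 8)
    weight, bias = torch.ones(3), torch.zeros(3)
    running_mean, running_var = torch.zeros(3), torch.ones(3)

    # Existing op: training is a runtime bool and running stats are declared
    # mutable (Tensor(a!), Tensor(b!)) even though training=False leaves them alone.
    out_a, _, _ = torch.ops.aten._native_batch_norm_legit(
        x, weight, bias, running_mean, running_var, False, 0.1, 1e-5)

    # New op: training is baked in as False, so running stats stay untouched
    # by construction, which is the guarantee the comment above is after.
    out_b, _, _ = torch.ops.aten._native_batch_norm_legit_no_training(
        x, weight, bias, running_mean, running_var, 0.1, 1e-5)

    assert torch.allclose(out_a, out_b)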
- func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
dispatch:
CPU: _batch_norm_legit_cpu_out
CUDA: _batch_norm_legit_cuda_out
MPS: _batch_norm_legit_mps_out
@@ -4053,11 +4182,11 @@
- func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
dispatch:
CUDA: batch_norm_backward_reduce_cuda
autogen: batch_norm_backward_reduce.out
-- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu, Tensor count) -> Tensor
+- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor
dispatch:
CUDA: batch_norm_backward_elemt_cuda
autogen: batch_norm_backward_elemt.out
- func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor)
@@ -4111,10 +4240,11 @@
- func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
dispatch:
CPU, CUDA: _cdist_forward
MPS: _cdist_forward_mps
autogen: _cdist_forward.out
+ tags: core
- func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
dispatch:
CPU, CUDA: _cdist_backward
autogen: _cdist_backward.out
@@ -4123,10 +4253,11 @@
- func: _pdist_forward(Tensor self, float p=2) -> Tensor
dispatch:
CPU, CUDA: _pdist_forward
autogen: _pdist_forward.out
+ tags: core
- func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
dispatch:
CPU, CUDA: _pdist_backward
autogen: _pdist_backward.out
@@ -4183,20 +4314,21 @@
- func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
dispatch:
CPU: pixel_shuffle_cpu
CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
autogen: pixel_shuffle.out
+ tags: core
- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
dispatch:
CPU: pixel_unshuffle_cpu
CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle
autogen: pixel_unshuffle.out
- func: channel_shuffle(Tensor self, int groups) -> Tensor
dispatch:
- CPU: channel_shuffle
+ CPU, CUDA: channel_shuffle
QuantizedCPU: channel_shuffle_quantized_cpu
autogen: channel_shuffle.out
- func: native_channel_shuffle(Tensor self, int groups) -> Tensor
dispatch:
@@ -4292,11 +4424,11 @@
dispatch:
CompositeExplicitAutograd: rand
autogen: rand.generator_with_names_out
- func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- tags: nondeterministic_seeded
+ tags: [core, nondeterministic_seeded]
dispatch:
CompositeExplicitAutograd: rand
- func: rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
@@ -4317,68 +4449,68 @@
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
CompositeExplicitAutograd: rand_like
autogen: rand_like.out
-- func: randint(int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint
-- func: randint.generator(int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint
-- func: randint.low(int low, int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint
-- func: randint.low_generator(int low, int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint
-- func: randint.out(int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+- func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint_out
-- func: randint.generator_out(int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+- func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint_out
-- func: randint.low_out(int low, int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+- func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint_out
-- func: randint.low_generator_out(int low, int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+- func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randint_out
-- func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+- func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
CompositeExplicitAutograd: randint_like
autogen: randint_like.out
-- func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+- func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
CompositeExplicitAutograd: randint_like
autogen: randint_like.low_dtype_out
- func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- tags: nondeterministic_seeded
+ tags: [core, nondeterministic_seeded]
dispatch:
CompositeExplicitAutograd: randn
- func: randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
@@ -4410,29 +4542,29 @@
- func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
- CompositeExplicitAutograd: randn_like
+ CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like
autogen: randn_like.out
-- func: randperm(int n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- tags: nondeterministic_seeded
+- func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ tags: [core, nondeterministic_seeded]
dispatch:
CompositeExplicitAutograd: randperm
-- func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randperm
-- func: randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!)
+- func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
tags: nondeterministic_seeded
dispatch:
CompositeExplicitAutograd: randperm_out
-- func: randperm.generator_out(int n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+- func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
tags: nondeterministic_seeded
dispatch:
CPU: randperm_out_cpu
CUDA: randperm_out_cuda
MPS: randperm_out_mps
@@ -4589,11 +4721,11 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: round_sparse
SparseCsrCPU, SparseCsrCUDA: round_sparse_csr
- tags: pointwise
+ tags: [core, pointwise]
- func: round_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured_delegate: round.out
variants: function, method
@@ -4837,14 +4969,18 @@
autogen: celu.out
- func: silu(Tensor self) -> Tensor
structured_delegate: silu.out
python_module: nn
+ dispatch:
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
- func: silu_(Tensor(a!) self) -> Tensor(a!)
structured_delegate: silu.out
python_module: nn
+ dispatch:
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
structured: True
structured_inherits: TensorIteratorBase
python_module: nn
@@ -4863,10 +4999,11 @@
- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
structured_delegate: silu_backward.grad_input
python_module: nn
dispatch:
CompositeImplicitAutograd: math_silu_backward
+ NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
- func: mish(Tensor self) -> Tensor
structured_delegate: mish.out
python_module: nn
@@ -4915,10 +5052,11 @@
- func: logit(Tensor self, float? eps=None) -> Tensor
variants: function, method
dispatch:
CPU, CUDA: logit
+ MPS: logit_mps
tags: pointwise
- func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
variants: function, method
dispatch:
@@ -4926,10 +5064,11 @@
tags: pointwise
- func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: logit_out
+ MPS: logit_out_mps
tags: pointwise
- func: sin(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: sin.out
@@ -5040,10 +5179,31 @@
- func: size.Dimname(Tensor self, Dimname dim) -> int
variants: function, method
device_check: NoCheck
device_guard: False
+- func: sym_size.int(Tensor self, int dim) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
+- func: sym_numel(Tensor self) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
+- func: sym_storage_offset(Tensor self) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
- func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
@@ -5064,36 +5224,37 @@
- func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
- CompositeExplicitAutograd: slice_scatter
+ CompositeExplicitAutogradNonFunctional: slice_scatter
autogen: slice_scatter.out
tags: core
- func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
- CompositeExplicitAutograd: select_scatter_symint
+ CompositeExplicitAutogradNonFunctional: select_scatter_symint
autogen: select_scatter.out
+ tags: core
- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
- CompositeExplicitAutograd: diagonal_scatter
+ CompositeExplicitAutogradNonFunctional: diagonal_scatter
autogen: diagonal_scatter.out
- func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
- CompositeExplicitAutograd: as_strided_scatter_symint
+ CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint
autogen: as_strided_scatter.out
- func: smm(Tensor self, Tensor mat2) -> Tensor
variants: function, method
@@ -5168,10 +5329,12 @@
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
CompositeExplicitAutograd: split_with_sizes
+ NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
+ tags: core
- func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
variants: function, method
- func: hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
@@ -5314,26 +5477,35 @@
- func: stride.Dimname(Tensor self, Dimname dim) -> int
variants: function, method
device_check: NoCheck
device_guard: False
+- func: sym_stride.int(Tensor self, int dim) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
- func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: sum
SparseCPU, SparseCUDA: sum_coo
SparseCsrCPU, SparseCsrCUDA: sum_csr
autogen: sum.out
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+ # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
structured_delegate: sum.IntList_out
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA: sum_sparse_coo
+ SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed
tags: core
- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
@@ -5362,14 +5534,16 @@
- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: nansum_out
MPS: nansum_out_mps
-- func: sum_to_size(Tensor self, int[] size) -> Tensor
+- func: sum_to_size(Tensor self, SymInt[] size) -> Tensor
variants: method
device_check: NoCheck
device_guard: False
+ dispatch:
+ CompositeImplicitAutograd: sum_to_size_symint
- func: sqrt(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: sqrt.out
variants: function, method
@@ -5419,11 +5593,11 @@
- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
-- func: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA: std
MPS: std_mps
@@ -5437,11 +5611,11 @@
- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
-- func: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
dispatch:
CPU, CUDA: std_mean
autogen: std_mean.correction_out
@@ -5449,19 +5623,19 @@
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
-- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
-- func: std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: std_out
QuantizedCPU: std_out_quantized_cpu
@@ -5472,30 +5646,32 @@
- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
-- func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
-- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA: prod
MPS: prod_mps
autogen: prod.out
+ tags: core
- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: prod.int_out
device_check: NoCheck # TensorIterator
variants: function, method
+ tags: core
- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
device_check: NoCheck # TensorIterator
dispatch:
@@ -5529,11 +5705,11 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: tan_sparse
SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr
- tags: pointwise
+ tags: [core, pointwise]
- func: tan_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured_delegate: tan.out
variants: function, method
@@ -5590,12 +5766,10 @@
- func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor
variants: function
- func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)
variants: function
- dispatch:
- CPU, CUDA: tensordot_out
# TODO: namespace threshold in 'nn'
- func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
device_check: NoCheck # TensorIterator
variants: function
@@ -5633,12 +5807,14 @@
SparseCPU, SparseCUDA: threshold_backward_sparse
SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed
NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
tags: pointwise
-- func: tile(Tensor self, int[] dims) -> Tensor
+- func: tile(Tensor self, SymInt[] dims) -> Tensor
variants: function, method
+ dispatch:
+ CompositeImplicitAutograd: tile_symint
- func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
variants: function, method
device_check: NoCheck
device_guard: False
@@ -5689,16 +5865,17 @@
variants: function, method
- func: flipud(Tensor self) -> Tensor
variants: function, method
-- func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
+- func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
variants: function, method
dispatch:
- CPU: roll_cpu
+ CPU, MPS: roll
CUDA: roll_cuda
autogen: roll.out
+ tags: core
# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args
- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
variants: function, method
@@ -5748,31 +5925,32 @@
variants: method
dispatch:
NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
autogen: _nested_tensor_strides.out
-- func: _nested_tensor_offsets(Tensor self) -> int[]
+- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
variants: method
dispatch:
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_offsets
+ NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets
+ autogen: _nested_tensor_storage_offsets.out
# _nested_from_padded is not usable from Python, so
# _nested_from_padded_and_nested_example is available for testing.
- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
dispatch:
NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
autogen: _nested_from_padded_and_nested_example.out
# The input arguments' types to these functions are temporary. When nested tensors switch to using SymInts for their metadata representation
# this will need to be updated
-- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor(a)
+- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)
variants: function
device_check: NoCheck
dispatch:
CPU, CUDA: _nested_view_from_buffer
-- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor
+- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor
variants: function
device_check: NoCheck
tags: view_copy
dispatch:
CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
@@ -5911,22 +6089,23 @@
device_check: NoCheck # TensorIterator
variants: function, method
tags: core
cpp_no_default_args: ["unbiased"]
-- func: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA: var
MPS: var_mps
+ tags: core
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
-- func: var.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: var_out
- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
@@ -5936,15 +6115,15 @@
- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
-- func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
-- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@@ -5954,11 +6133,11 @@
- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
-- func: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
dispatch:
CPU, CUDA: var_mean
autogen: var_mean.correction_out
@@ -5966,11 +6145,11 @@
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
-- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
variants: method
@@ -6034,11 +6213,11 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: zeros
autogen: zeros.names_out
-- func: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CPU: _efficientzerotensor
CUDA: _efficientzerotensor_cuda
Meta: _efficientzerotensor_meta
autogen: _efficientzerotensor.out
@@ -6054,11 +6233,11 @@
- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
dispatch:
# NB: Although this composite mutates on the inside, it is
# non-differentiable so NonFunctional doesn't apply
- CompositeExplicitAutograd: zeros_like
+ CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like
autogen: zeros_like.out
- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
variants: function
dispatch:
@@ -6295,11 +6474,11 @@
SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed
MkldnnCPU: mkldnn_clone
QuantizedCPU, QuantizedCUDA: quantized_clone
NestedTensorCPU, NestedTensorCUDA: clone_nested
autogen: clone.out
- tags: core
+ tags: [core, pointwise]
- func: positive(Tensor(a) self) -> Tensor(a)
variants: function, method
tags: pointwise
@@ -6307,10 +6486,11 @@
use_const_ref_for_mutable_tensors: True
variants: function, method
dispatch:
CompositeExplicitAutograd: resize_as_
autogen: resize_as, resize_as.out
+ tags: inplace_view
- func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
use_const_ref_for_mutable_tensors: True
variants: function, method
dispatch:
@@ -6326,10 +6506,11 @@
MPS: zero_mps_
Meta: zero_meta_
SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
MkldnnCPU: mkldnn_zero_
+ NestedTensorCPU, NestedTensorCUDA: zero_nested_
autogen: zero, zero.out
- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
@@ -6345,10 +6526,11 @@
variants: function, method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA: sub_sparse
ZeroTensor: sub_zerotensor
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
tags: [core, pointwise]
- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
@@ -6491,10 +6673,20 @@
- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
structured_delegate: _addmm_activation.out
variants: function, method
+- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None) -> (Tensor, Tensor)
+ variants: function
+ dispatch:
+ CUDA: _scaled_mm_cuda
+
+- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
+ variants: function
+ dispatch:
+ CUDA: _scaled_mm_out_cuda
+
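# Editor's note: _scaled_mm is a new CUDA-only primitive for scaled (typically FP8) matrix
# multiplication that returns both the product and an amax tensor, per the schema above.
# The sketch below is a hypothetical illustration only: torch._scaled_mm is a private API,
# and the float8 dtypes plus the exact keyword surface depend on the build and GPU.

    import torch

    # Hypothetical usage sketch; requires a CUDA device with FP8 support.
    a = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn)
    b = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn).t()  # column-major mat2

    out, out_amax = torch._scaled_mm(a, b, out_dtype=torch.bfloat16)
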
# NOTE [ Sparse: autograd and API ]
#
#
# Sparse Tensor Constructors
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -6603,16 +6795,21 @@
# sparse tensor.
# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
# the default would never make sense.
- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: sparse_compressed_tensor
+
- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: sparse_compressed_tensor
- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
@@ -6625,19 +6822,19 @@
- func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
dispatch:
CompositeExplicitAutograd: sparse_coo_tensor
autogen: sparse_coo_tensor.size_out
-- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
-- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
-- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
dispatch:
CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint
-- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> ()
+- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
- func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
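
# Editor's note: the sparse COO constructors above gain an optional is_coalesced flag so
# callers that already know their indices are unique and sorted can skip the coalesce
# check. A hedged sketch, assuming the flag is surfaced on torch.sparse_coo_tensor:

    import torch

    i = torch.tensor([[0, 1, 2],
                      [2, 0, 1]])
    v = torch.tensor([3.0, 4.0, 5.0])

    # Indices are already unique and sorted, so the check can be skipped.
    s = torch.sparse_coo_tensor(i, v, (3, 3), is_coalesced=True)
    assert s.is_coalesced()
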
@@ -6646,11 +6843,11 @@
- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
dispatch:
SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse
autogen: _sparse_coo_tensor_with_dims.out
-- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
dispatch:
SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint
autogen: _sparse_coo_tensor_with_dims_and_tensors.out
- func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
@@ -6669,29 +6866,35 @@
- func: sparse_mask(Tensor self, Tensor mask) -> Tensor
variants: method
dispatch:
SparseCPU, SparseCUDA: sparse_mask
- SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed
autogen: sparse_mask.out
+- func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
+ variants: method
+ dispatch:
+ SparseCPU, SparseCUDA: sparse_mask_projection
+ autogen: _sparse_mask_projection.out
+
- func: _to_cpu(Tensor[] tensors) -> Tensor[]
variants: function
-- func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
+- func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
variants: method
# Special case of to_dense with custom derivative
-- func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
+- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
variants: method
dispatch:
SparseCPU, SparseCUDA: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense
MkldnnCPU: mkldnn_to_dense
autogen: _to_dense.out
-- func: to_dense_backward(Tensor grad, Tensor input) -> Tensor
+- func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor
- func: sparse_dim(Tensor self) -> int
variants: method
dispatch:
CPU, CUDA: sparse_dim_strided
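
# Editor's note: to_dense and to_dense_backward gain an optional masked_grad flag that
# controls whether the gradient is restricted to the input's sparsity pattern. A short
# sketch, assuming the standard Python surface of this operator:

    import torch

    s = torch.eye(3).to_sparse().requires_grad_(True)

    # masked_grad=True keeps the gradient on s's sparsity pattern;
    # masked_grad=False treats to_dense as a plain densifying copy for autograd.
    d = s.to_dense(masked_grad=True)
    d.sum().backward()
    print(s.grad)
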
@@ -6857,56 +7060,85 @@
- func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
variants: function, method
- func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
variants: method
+
+# Special case of to_sparse.sparse_dim with custom derivative
+- func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+ variants: method
dispatch:
CPU, CUDA: dense_to_sparse
SparseCPU, SparseCUDA: sparse_coo_to_sparse
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
- autogen: to_sparse.sparse_dim_out
+ autogen: _to_sparse.sparse_dim_out
- func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
variants: method
+
+# Special case of to_sparse with custom derivative
+- func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+ variants: method
dispatch:
CPU, CUDA: dense_to_sparse
SparseCPU, SparseCUDA: sparse_coo_to_sparse
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
- autogen: to_sparse.out
+ autogen: _to_sparse.out
- func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
variants: method
+
+# Special case of to_sparse_csr with custom derivative
+- func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+ variants: method
dispatch:
CPU, CUDA: dense_to_sparse_csr
SparseCPU, SparseCUDA: coo_to_sparse_csr
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr
- autogen: to_sparse_csr.out
+ autogen: _to_sparse_csr.out
- func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
variants: method
+
+# Special case of to_sparse_csc with custom derivative
+- func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+ variants: method
dispatch:
CPU, CUDA: dense_to_sparse_csc
SparseCPU, SparseCUDA: coo_to_sparse_csc
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc
- autogen: to_sparse_csc.out
+ autogen: _to_sparse_csc.out
- func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
variants: method
+
+# Special case of to_sparse_bsr with custom derivative
+- func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+ variants: method
dispatch:
CPU, CUDA: dense_to_sparse_bsr
SparseCPU, SparseCUDA: coo_to_sparse_bsr
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr
- autogen: to_sparse_bsr.out
+ autogen: _to_sparse_bsr.out
- func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
variants: method
+
+# Special case of to_sparse_bsc with custom derivative
+- func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+ variants: method
dispatch:
CPU, CUDA: dense_to_sparse_bsc
SparseCPU, SparseCUDA: coo_to_sparse_bsc
SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc
- autogen: to_sparse_bsc.out
+ autogen: _to_sparse_bsc.out
+- func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)
+ variants: function
+ dispatch:
+ CUDA: _to_sparse_semi_structured
+
- func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
variants: method
dispatch:
CPU: dense_to_mkldnn
autogen: to_mkldnn.out
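
# Editor's note: the hunk above splits each to_sparse* conversion into a public composite
# entry point plus a private _to_sparse* op that owns the dispatch table and the custom
# derivative. Call sites are unaffected; the usual conversions keep working:

    import torch

    x = torch.randn(4, 4)
    x[x.abs() < 0.5] = 0

    # Public entry points are unchanged; they now route through the private
    # _to_sparse* ops that carry the custom derivatives.
    coo = x.to_sparse()
    csr = x.to_sparse_csr()
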
@@ -7172,11 +7404,11 @@
- func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
variants: function
# NB: Does NOT check precondition that numel == 1
- func: _local_scalar_dense(Tensor self) -> Scalar
- tags: data_dependent_output
+ tags: [core, data_dependent_output]
dispatch:
CPU: _local_scalar_dense_cpu
CUDA: _local_scalar_dense_cuda
MPS: _local_scalar_dense_mps
variants: function
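
# Editor's note: _local_scalar_dense picks up the core tag above; it is the kernel behind
# Tensor.item(), which is data-dependent (hence the data_dependent_output tag) and forces
# a host sync on accelerator devices:

    import torch

    t = torch.tensor([42.0])
    v = t.item()  # lowers to _local_scalar_dense; data-dependent, syncs on GPU
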
@@ -7185,12 +7417,13 @@
- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
dispatch:
MPS: _lstm_mps
autogen: _lstm_mps.out
+ tags: nondeterministic_seeded
-- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
+- func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
dispatch:
MPS: lstm_mps_backward
autogen: lstm_mps_backward.out
@@ -7224,24 +7457,32 @@
- func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
# RNN cells and layers
- func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+ tags: nondeterministic_seeded
- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)
- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
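
# Editor's note: the RNN entry points above are now tagged nondeterministic_seeded, i.e.
# their output depends on the RNG state whenever dropout is active. A minimal
# reproducibility sketch on CPU (module and tensor names here are illustrative only):

    import torch

    rnn = torch.nn.LSTM(8, 16, num_layers=2, dropout=0.5)
    x = torch.randn(5, 3, 8)

    torch.manual_seed(0)
    out1, _ = rnn(x)
    torch.manual_seed(0)
    out2, _ = rnn(x)
    assert torch.equal(out1, out2)  # same seed -> same dropout masks
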
@@ -7380,10 +7621,11 @@
- func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: masked_fill
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
tags: pointwise
- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
@@ -7404,10 +7646,11 @@
- func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)
variants: method
dispatch:
CPU: masked_scatter__cpu
CUDA: masked_scatter__cuda
+ MPS: masked_scatter__mps
autogen: masked_scatter.out
- func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
variants: function, method
dispatch:
@@ -7501,10 +7744,11 @@
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CPU: index_fill_
CUDA: index_fill_
+ MPS: index_fill_mps_
autogen: index_fill.int_Scalar_out
- func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
@@ -7514,10 +7758,11 @@
- func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CPU, CUDA: index_fill_
+ MPS: index_fill_mps_
autogen: index_fill.int_Tensor_out
- func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
@@ -7541,10 +7786,11 @@
variants: function, method
- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
structured_delegate: scatter.src_out
variants: function, method
+ tags: core
- func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
structured_delegate: scatter.src_out
variants: method
@@ -7556,10 +7802,11 @@
MPS: scatter_src_out_mps
- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
structured_delegate: scatter.value_out
variants: function, method
+ tags: core
- func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
structured_delegate: scatter.value_out
variants: method
@@ -7655,10 +7902,11 @@
structured: True
structured_inherits: TensorIteratorBase
variants: function
dispatch:
CPU, CUDA: bitwise_and_out
+ MPS: bitwise_and_out_mps
tags: pointwise
- func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
@@ -7669,11 +7917,11 @@
- func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
dispatch:
CompositeExplicitAutograd: bitwise_and
- tags: pointwise
+ tags: [core, pointwise]
- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
variants: function
dispatch:
@@ -7719,10 +7967,11 @@
structured: True
structured_inherits: TensorIteratorBase
variants: function
dispatch:
CPU, CUDA: bitwise_or_out
+ MPS: bitwise_or_out_mps
tags: pointwise
- func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
@@ -7731,11 +7980,11 @@
tags: pointwise
- func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
- tags: pointwise
+ tags: [core, pointwise]
- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
variants: function
dispatch:
@@ -7781,10 +8030,11 @@
structured: True
structured_inherits: TensorIteratorBase
variants: function
dispatch:
CPU, CUDA: bitwise_xor_out
+ MPS: bitwise_xor_out_mps
tags: pointwise
- func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
@@ -7793,11 +8043,11 @@
tags: pointwise
- func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
- tags: pointwise
+ tags: [core, pointwise]
- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
variants: function
dispatch:
@@ -8065,10 +8315,11 @@
device_check: NoCheck # TensorIterator
tags: nondeterministic_seeded
variants: method
dispatch:
CPU, CUDA: random_
+ MPS: random_mps_
Meta: random_meta_
autogen: random, random.out
- func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@@ -8162,11 +8413,11 @@
- func: trace(Tensor self) -> Tensor
variants: method, function
dispatch:
CPU: trace_cpu
CUDA: trace_cuda
- MPS: trace_mps_out
+ MPS: trace_mps
autogen: trace.out
- func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
variants: function
device_check: NoCheck
@@ -8602,10 +8853,19 @@
CPU: nonzero_cpu
CUDA: nonzero_cuda
MPS: nonzero_mps
tags: [dynamic_output_shape, core]
+- func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CPU: nonzero_static_out_cpu
+
+- func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
+ variants: method, function
+ dispatch:
+ CPU: nonzero_static_cpu
+
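# Editor's note: nonzero_static is a new CPU op that returns a fixed number of rows (size),
# padding with fill_value when there are fewer matches, which avoids the data-dependent
# output shape of nonzero. A hedged sketch, assuming the torch.nonzero_static surface:

    import torch

    x = torch.tensor([0, 3, 0, 7])

    # Always returns `size` rows; missing rows are padded with fill_value.
    idx = torch.nonzero_static(x, size=4, fill_value=-1)
    # tensor([[ 1],
    #         [ 3],
    #         [-1],
    #         [-1]])
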
- func: nonzero_numpy(Tensor self) -> Tensor[]
variants: method, function
- func: argwhere(Tensor self) -> Tensor
variants: method, function
@@ -8708,12 +8968,14 @@
variants: function
dispatch:
CPU, CUDA: linalg_solve_triangular
MPS: linalg_solve_triangular_mps
-- func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
+- func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor
python_module: linalg
+ dispatch:
+ CompositeImplicitAutograd: linalg_vander_symint
- func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
- func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
variants: method, function
@@ -8915,10 +9177,11 @@
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: erfinv_out
+ MPS: erfinv_out_mps
SparseCPU, SparseCUDA: erfinv_sparse_out
SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out
tags: pointwise
- func: i0(Tensor self) -> Tensor
@@ -8997,11 +9260,11 @@
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: atan2_out
- MPS: atan2_mps_out
+ MPS: atan2_out_mps
tags: pointwise
- func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured_delegate: atan2.out
@@ -9028,18 +9291,20 @@
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: lerp_Scalar
+ MPS: lerp_Scalar_mps
tags: pointwise
- func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: lerp_Tensor
+ MPS: lerp_Tensor_mps
tags: pointwise
- func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
@@ -9052,50 +9317,50 @@
structured_delegate: lerp.Tensor_out
tags: pointwise
- func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
- CPU: histogram_histc_cpu_out
+ CPU, MPS: histogram_histc_out
CUDA: _histc_out_cuda
- func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
variants: method, function
dispatch:
- CPU: histogram_histc_cpu
+ CPU, MPS: histogram_histc
CUDA: _histc_cuda
- func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
dispatch:
- CPU: histogram_out_cpu
+ CPU, MPS: histogram_out
- func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
variants: method, function
dispatch:
- CPU: histogram_cpu
+ CPU, MPS: histogram
- func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
dispatch:
- CPU: histogram_out_cpu
+ CPU, MPS: histogram_out
- func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
variants: method, function
dispatch:
- CPU: histogram_cpu
+ CPU, MPS: histogram
- func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[]
dispatch:
- CPU: histogramdd_bin_edges_cpu
+ CPU, MPS: histogramdd_bin_edges
autogen: _histogramdd_bin_edges.out
- func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor
dispatch:
- CPU: histogramdd_cpu
+ CPU, MPS: _histogramdd
autogen: _histogramdd_from_bin_cts.out
- func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor
dispatch:
- CPU: histogramdd_cpu
+ CPU, MPS: _histogramdd
autogen: _histogramdd_from_bin_tensors.out
- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
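
# Editor's note: the hunk above renames the histc/histogram kernels from *_cpu to
# device-generic names and registers them for MPS as well as CPU. The public surface is
# unchanged:

    import torch

    x = torch.randn(1000)

    hist, bin_edges = torch.histogram(x, bins=10, range=(-3.0, 3.0))
    counts = torch.histc(x, bins=10, min=-3, max=3)  # fixed-width bins, counts only
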
@@ -9111,11 +9376,11 @@
- func: fmod.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
dispatch:
CompositeExplicitAutograd: fmod
- tags: pointwise
+ tags: [core, pointwise]
- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
@@ -9146,10 +9411,11 @@
- func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: hypot_out
+ MPS: hypot_out_mps
tags: pointwise
- func: hypot(Tensor self, Tensor other) -> Tensor
structured_delegate: hypot.out
variants: method, function
@@ -9218,11 +9484,11 @@
- func: remainder.Scalar(Tensor self, Scalar other) -> Tensor
variants: method, function
dispatch:
CompositeExplicitAutograd: remainder
- tags: pointwise
+ tags: [core, pointwise]
- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
variants: method
dispatch:
CompositeExplicitAutograd: remainder_
@@ -9263,16 +9529,15 @@
dispatch:
CPU, CUDA: min
MPS: min_mps
QuantizedCPU: min_quantized_cpu
-# Not to be confused with binary op `min.out`. Commented because of failed CI
-# FIXME: enable this
-#- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-# device_check: NoCheck # TensorIterator
-# dispatch:
-# CompositeExplicitAutograd: min_unary_out
+- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+ device_check: NoCheck # TensorIterator
+ dispatch:
+ CPU, CUDA: min_unary_out
+ QuantizedCPU: min_quantized_unary_out
- func: fmin(Tensor self, Tensor other) -> Tensor
structured_delegate: fmin.out
device_check: NoCheck # TensorIterator
variants: method, function
@@ -9281,11 +9546,11 @@
- func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
structured: True
structured_inherits: TensorIteratorBase
device_check: NoCheck # TensorIterator
dispatch:
- CPU, CUDA: fmin_out
+ CPU, CUDA, MPS: fmin_out
tags: pointwise
- func: max(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
@@ -9303,11 +9568,11 @@
- func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
structured: True
structured_inherits: TensorIteratorBase
device_check: NoCheck # TensorIterator
dispatch:
- CPU, CUDA: fmax_out
+ CPU, CUDA, MPS: fmax_out
tags: pointwise
- func: maximum(Tensor self, Tensor other) -> Tensor
structured_delegate: maximum.out
device_check: NoCheck # TensorIterator
@@ -9400,10 +9665,11 @@
- func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: method, function
dispatch:
CompositeExplicitAutograd: sort
+ tags: core
- func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
structured_delegate: sort.values_stable
variants: method, function
dispatch:
@@ -9436,18 +9702,18 @@
autogen: argsort.stable_out
- func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
variants: method, function
-- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+- func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
structured: True
dispatch:
CPU: topk_out_cpu
CUDA: topk_out_cuda
MPS: topk_out_mps
-- func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+- func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
variants: method, function
structured_delegate: topk.values
dispatch:
QuantizedCPU: topk_quantized_cpu
tags: core
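
# Editor's note: topk's k becomes SymInt, which matters when k is derived from a symbolic
# size under dynamic-shape compilation. An illustrative sketch (whether a recompile is
# avoided for new widths depends on the compiler's guards; treat that as an assumption):

    import torch

    def top_half(x):
        k = x.shape[-1] // 2   # may be a symbolic size under torch.compile
        values, indices = torch.topk(x, k)
        return values

    compiled = torch.compile(top_half, dynamic=True)
    compiled(torch.randn(4, 8))
    compiled(torch.randn(4, 12))
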
@@ -9468,10 +9734,11 @@
device_check: NoCheck # TensorIterator
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA: any_sparse
+ tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
structured: True
dispatch:
@@ -9481,10 +9748,11 @@
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
dispatch:
CPU, CUDA: renorm_out
+ MPS: renorm_out_mps
- func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
device_check: NoCheck # TensorIterator
variants: method, function
structured_delegate: renorm.out
@@ -9535,10 +9803,11 @@
- func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
dispatch:
CPU, CUDA: pow_Scalar_out
+ MPS: pow_Scalar_out_mps
tags: pointwise
- func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: pow.Scalar_out
@@ -9609,10 +9878,11 @@
dispatch:
CPU, CUDA: normal_
MPS: normal_mps_
Meta: normal_meta_
SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_
+ NestedTensorCPU, NestedTensorCUDA: normal_nested_
autogen: normal.out
# Only used by the functionalization pass.
# Normally, the codegen would be able to generate a normal() NativeFunction,
# but we can't due to overload ambiguity with normal.Tensor_float.
@@ -9718,160 +9988,159 @@
dispatch:
CPU: foreach_tensor_add_scalar_kernel_slow_
CUDA: foreach_tensor_add_scalar_kernel_cuda_
autogen: _foreach_add.Scalar_out
-- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalar_kernel_slow
- CUDA: foreach_tensor_sub_scalar_kernel_cuda
+ CPU: foreach_tensor_add_list_kernel_slow
+ CUDA: foreach_tensor_add_list_kernel_cuda
-- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalar_kernel_slow_
- CUDA: foreach_tensor_sub_scalar_kernel_cuda_
- autogen: _foreach_sub.Scalar_out
+ CPU: foreach_tensor_add_list_kernel_slow_
+ CUDA: foreach_tensor_add_list_kernel_cuda_
+ autogen: _foreach_add.List_out
-- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalar_kernel_slow
- CUDA: foreach_tensor_mul_scalar_kernel_cuda
+ CPU: foreach_tensor_add_scalarlist_kernel_slow
+ CUDA: foreach_tensor_add_scalarlist_kernel_cuda
-- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalar_kernel_slow_
- CUDA: foreach_tensor_mul_scalar_kernel_cuda_
- autogen: _foreach_mul.Scalar_out
+ CPU: foreach_tensor_add_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
+ autogen: _foreach_add.ScalarList_out
-- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalar_kernel_slow
- CUDA: foreach_tensor_div_scalar_kernel_cuda
+ CPU: foreach_tensor_sub_scalar_kernel_slow
+ CUDA: foreach_tensor_sub_scalar_kernel_cuda
-- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalar_kernel_slow_
- CUDA: foreach_tensor_div_scalar_kernel_cuda_
- autogen: _foreach_div.Scalar_out
+ CPU: foreach_tensor_sub_scalar_kernel_slow_
+ CUDA: foreach_tensor_sub_scalar_kernel_cuda_
+ autogen: _foreach_sub.Scalar_out
-- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
+ CPU: foreach_tensor_sub_list_kernel_slow
+ CUDA: foreach_tensor_sub_list_kernel_cuda
-- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
- autogen: _foreach_clamp_min.Scalar_out
+ CPU: foreach_tensor_sub_list_kernel_slow_
+ CUDA: foreach_tensor_sub_list_kernel_cuda_
+ autogen: _foreach_sub.List_out
-- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
+ CPU: foreach_tensor_sub_scalarlist_kernel_slow
+ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
-- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
- autogen: _foreach_clamp_max.Scalar_out
+ CPU: foreach_tensor_sub_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
+ autogen: _foreach_sub.ScalarList_out
-# foreach_minimum/maximum dispatches to clamp_max/min
-- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
+ CPU: foreach_tensor_mul_scalar_kernel_slow
+ CUDA: foreach_tensor_mul_scalar_kernel_cuda
-- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
- autogen: _foreach_maximum.Scalar_out
+ CPU: foreach_tensor_mul_scalar_kernel_slow_
+ CUDA: foreach_tensor_mul_scalar_kernel_cuda_
+ autogen: _foreach_mul.Scalar_out
-- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
+ CPU: foreach_tensor_mul_list_kernel_slow
+ CUDA: foreach_tensor_mul_list_kernel_cuda
-- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
- autogen: _foreach_minimum.Scalar_out
+ CPU: foreach_tensor_mul_list_kernel_slow_
+ CUDA: foreach_tensor_mul_list_kernel_cuda_
+ autogen: _foreach_mul.List_out
-- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_list_kernel_slow
- CUDA: foreach_tensor_add_list_kernel_cuda
+ CPU: foreach_tensor_mul_scalarlist_kernel_slow
+ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
-- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
+- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_list_kernel_slow_
- CUDA: foreach_tensor_add_list_kernel_cuda_
- autogen: _foreach_add.List_out
+ CPU: foreach_tensor_mul_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
+ autogen: _foreach_mul.ScalarList_out
-- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+- func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_list_kernel_slow
- CUDA: foreach_tensor_sub_list_kernel_cuda
+ CPU: foreach_tensor_mul_tensor_kernel_slow
+ CUDA: foreach_tensor_mul_tensor_kernel_cuda
-- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
+- func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_list_kernel_slow_
- CUDA: foreach_tensor_sub_list_kernel_cuda_
- autogen: _foreach_sub.List_out
+ CPU: foreach_tensor_mul_tensor_kernel_slow_
+ CUDA: foreach_tensor_mul_tensor_kernel_cuda_
+ autogen: _foreach_mul.Tensor_out
-- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
+- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_list_kernel_slow
- CUDA: foreach_tensor_mul_list_kernel_cuda
+ CPU: foreach_tensor_div_scalar_kernel_slow
+ CUDA: foreach_tensor_div_scalar_kernel_cuda
-- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_list_kernel_slow_
- CUDA: foreach_tensor_mul_list_kernel_cuda_
- autogen: _foreach_mul.List_out
+ CPU: foreach_tensor_div_scalar_kernel_slow_
+ CUDA: foreach_tensor_div_scalar_kernel_cuda_
+ autogen: _foreach_div.Scalar_out
- func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
@@ -9884,25 +10153,40 @@
dispatch:
CPU: foreach_tensor_div_list_kernel_slow_
CUDA: foreach_tensor_div_list_kernel_cuda_
autogen: _foreach_div.List_out
-- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
+- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda
+ CPU: foreach_tensor_div_scalarlist_kernel_slow
+ CUDA: foreach_tensor_div_scalarlist_kernel_cuda
-- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
- autogen: _foreach_clamp_min.List_out
+ CPU: foreach_tensor_div_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
+ autogen: _foreach_div.ScalarList_out
+- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
+
+- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
+ autogen: _foreach_clamp_max.Scalar_out
+
- func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_clamp_max_list_kernel_slow
@@ -9914,147 +10198,147 @@
dispatch:
CPU: foreach_tensor_clamp_max_list_kernel_slow_
CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
autogen: _foreach_clamp_max.List_out
-# foreach_minimum/maximum dispatches to clamp_max/min
-- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda
+ CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
-- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
- autogen: _foreach_maximum.List_out
+ CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
+ autogen: _foreach_clamp_max.ScalarList_out
-- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow
- CUDA: foreach_tensor_clamp_max_list_kernel_cuda
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
-- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
- CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
- autogen: _foreach_minimum.List_out
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
+ autogen: _foreach_clamp_min.Scalar_out
-
-- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_scalarlist_kernel_slow
- CUDA: foreach_tensor_add_scalarlist_kernel_cuda
+ CPU: foreach_tensor_clamp_min_list_kernel_slow
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda
-- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
- CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
- autogen: _foreach_add.ScalarList_out
+ CPU: foreach_tensor_clamp_min_list_kernel_slow_
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
+ autogen: _foreach_clamp_min.List_out
-- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
- CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
+ CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
-- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
- CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
- autogen: _foreach_sub.ScalarList_out
+ CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
+ autogen: _foreach_clamp_min.ScalarList_out
-- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+# foreach_minimum/maximum dispatches to clamp_max/min
+- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalarlist_kernel_slow
- CUDA: foreach_tensor_div_scalarlist_kernel_cuda
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
-- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
- CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
- autogen: _foreach_div.ScalarList_out
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
+ autogen: _foreach_maximum.Scalar_out
-- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+# foreach_minimum/maximum dispatches to clamp_max/min
+- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
- CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
+ CPU: foreach_tensor_clamp_min_list_kernel_slow
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda
-- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
- CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
- autogen: _foreach_mul.ScalarList_out
+ CPU: foreach_tensor_clamp_min_list_kernel_slow_
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
+ autogen: _foreach_maximum.List_out
-- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+# foreach_minimum/maximum dispatches to clamp_max/min
+- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
-- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
- autogen: _foreach_clamp_min.ScalarList_out
+ autogen: _foreach_maximum.ScalarList_out
-- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
- CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
-- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
- CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
- autogen: _foreach_clamp_max.ScalarList_out
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
+ autogen: _foreach_minimum.Scalar_out
-# foreach_minimum/maximum dispatches to clamp_max/min
-- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
- CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
+ CPU: foreach_tensor_clamp_max_list_kernel_slow
+ CUDA: foreach_tensor_clamp_max_list_kernel_cuda
-- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
- CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
- autogen: _foreach_maximum.ScalarList_out
+ CPU: foreach_tensor_clamp_max_list_kernel_slow_
+ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
+ autogen: _foreach_minimum.List_out
- func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
@@ -10067,48 +10351,100 @@
dispatch:
CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
autogen: _foreach_minimum.ScalarList_out
-- func: _foreach_exp(Tensor[] self) -> Tensor[]
+- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_exp_slow
- CUDA: foreach_tensor_exp_cuda
+ CPU: foreach_tensor_addcdiv_scalar_slow
+ CUDA: foreach_tensor_addcdiv_scalar_cuda
-- func: _foreach_zero_(Tensor(a!)[] self) -> ()
+- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_zero_slow_
- CUDA: foreach_tensor_zero_cuda_
- autogen: _foreach_zero, _foreach_zero.out
+ CPU: foreach_tensor_addcdiv_scalarlist_slow
+ CUDA: foreach_tensor_addcdiv_scalarlist_cuda
-- func: _foreach_exp_(Tensor(a!)[] self) -> ()
+- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_exp_slow_
- CUDA: foreach_tensor_exp_cuda_
- autogen: _foreach_exp.out
+ CPU: foreach_tensor_addcdiv_tensor_slow
+ CUDA: foreach_tensor_addcdiv_tensor_cuda
-- func: _foreach_sqrt(Tensor[] self) -> Tensor[]
+- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sqrt_slow
- CUDA: foreach_tensor_sqrt_cuda
+ CPU: foreach_tensor_addcdiv_scalar_slow_
+ CUDA: foreach_tensor_addcdiv_scalar_cuda_
+ autogen: _foreach_addcdiv.Scalar_out
-- func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
+- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sqrt_slow_
- CUDA: foreach_tensor_sqrt_cuda_
- autogen: _foreach_sqrt.out
+ CPU: foreach_tensor_addcdiv_scalarlist_slow_
+ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
+ autogen: _foreach_addcdiv.ScalarList_out
+- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcdiv_tensor_slow_
+ CUDA: foreach_tensor_addcdiv_tensor_cuda_
+ autogen: _foreach_addcdiv.Tensor_out
+
+- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcmul_scalar_slow
+ CUDA: foreach_tensor_addcmul_scalar_cuda
+
+- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcmul_scalarlist_slow
+ CUDA: foreach_tensor_addcmul_scalarlist_cuda
+
+- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcmul_tensor_slow
+ CUDA: foreach_tensor_addcmul_tensor_cuda
+
+- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcmul_scalar_slow_
+ CUDA: foreach_tensor_addcmul_scalar_cuda_
+ autogen: _foreach_addcmul.Scalar_out
+
+- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcmul_scalarlist_slow_
+ CUDA: foreach_tensor_addcmul_scalarlist_cuda_
+ autogen: _foreach_addcmul.ScalarList_out
+
+- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_addcmul_tensor_slow_
+ CUDA: foreach_tensor_addcmul_tensor_cuda_
+ autogen: _foreach_addcmul.Tensor_out
+
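The _foreach_addcdiv and _foreach_addcmul overloads added above operate on whole lists of tensors in one call, which is the pattern fused optimizer steps use to avoid per-parameter kernel launches. A minimal sketch of the in-place Scalar and ScalarList overloads, assuming the private torch._foreach_* Python bindings that are generated from these schemas:

    import torch

    params = [torch.randn(3), torch.randn(3)]
    grads  = [torch.randn(3), torch.randn(3)]
    avg_sq = [torch.ones(3), torch.ones(3)]

    # Scalar overload: one scalar applied to every list element.
    torch._foreach_addcmul_(avg_sq, grads, grads, value=0.1)

    # ScalarList overload: one scalar per list element, selected by passing a list.
    denom = [t.sqrt() for t in avg_sq]
    torch._foreach_addcdiv_(params, grads, denom, [-1e-3, -1e-3])
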
- func: _foreach_abs(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_abs_slow
@@ -10240,10 +10576,25 @@
dispatch:
CPU: foreach_tensor_erfc_slow_
CUDA: foreach_tensor_erfc_cuda_
autogen: _foreach_erfc.out
+- func: _foreach_exp(Tensor[] self) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_exp_slow
+ CUDA: foreach_tensor_exp_cuda
+
+- func: _foreach_exp_(Tensor(a!)[] self) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_exp_slow_
+ CUDA: foreach_tensor_exp_cuda_
+ autogen: _foreach_exp.out
+
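_foreach_exp follows the same shape as the other unary foreach ops: the functional form returns a new list, the trailing-underscore form mutates its inputs, and the .out variant is autogenerated. A rough illustration through the Python bindings, assumed to mirror this schema:

    import torch

    xs = [torch.randn(4), torch.randn(2, 2)]

    ys = torch._foreach_exp(xs)   # functional: returns a new list of tensors
    torch._foreach_exp_(xs)       # in-place: overwrites each tensor in xs
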
- func: _foreach_expm1(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_expm1_slow
@@ -10270,10 +10621,72 @@
dispatch:
CPU: foreach_tensor_floor_slow_
CUDA: foreach_tensor_floor_cuda_
autogen: _foreach_floor.out
+- func: _foreach_frac(Tensor[] self) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_frac_slow
+ CUDA: foreach_tensor_frac_cuda
+
+- func: _foreach_frac_(Tensor(a!)[] self) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_frac_slow_
+ CUDA: foreach_tensor_frac_cuda_
+ autogen: _foreach_frac.out
+
+- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_ternary_lerp_slow
+ CUDA: foreach_tensor_lerp_ternary_cuda
+ autogen: _foreach_lerp.List_out
+
+- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_ternary_lerp_slow_
+ CUDA: foreach_tensor_lerp_ternary_cuda_
+ autogen: _foreach_lerp.List_out
+
+- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_lerp_list_kernel_slow
+ CUDA: foreach_tensor_lerp_list_cuda
+ autogen: _foreach_lerp.Scalar_out
+
+- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_lerp_list_kernel_slow_
+ CUDA: foreach_tensor_lerp_list_cuda_
+ autogen: _foreach_lerp.Scalar_out
+
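The _foreach_lerp overloads interpolate each self[i] toward tensors1[i], either with one Scalar weight for the whole call or with a per-element weights list; the in-place Scalar form is what exponential-moving-average updates typically reduce to. A small sketch, assuming the torch._foreach_lerp_ binding generated from this entry:

    import torch

    ema_params   = [torch.zeros(3), torch.zeros(3)]
    model_params = [torch.randn(3), torch.randn(3)]

    # ema[i] <- ema[i] + 0.01 * (model[i] - ema[i]), one fused call for the list
    torch._foreach_lerp_(ema_params, model_params, 0.01)
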
+- func: _foreach_lgamma(Tensor[] self) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_lgamma_slow
+ CUDA: foreach_tensor_lgamma_cuda
+
+- func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_lgamma_slow_
+ CUDA: foreach_tensor_lgamma_cuda_
+ autogen: _foreach_lgamma.out
+
- func: _foreach_log(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_log_slow
@@ -10345,69 +10758,84 @@
dispatch:
CPU: foreach_tensor_neg_slow_
CUDA: foreach_tensor_neg_cuda_
autogen: _foreach_neg.out
-- func: _foreach_tan(Tensor[] self) -> Tensor[]
+- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tan_slow
- CUDA: foreach_tensor_tan_cuda
+ CPU: foreach_tensor_norm_slow
+ CUDA: foreach_tensor_norm_cuda
+ autogen: _foreach_norm.Scalar_out
-- func: _foreach_tan_(Tensor(a!)[] self) -> ()
+- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tan_slow_
- CUDA: foreach_tensor_tan_cuda_
- autogen: _foreach_tan.out
+ CPU: foreach_tensor_pow_list_kernel_slow
+ CUDA: foreach_tensor_pow_list_kernel_cuda
-- func: _foreach_tanh(Tensor[] self) -> Tensor[]
+- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tanh_slow
- CUDA: foreach_tensor_tanh_cuda
+ CPU: foreach_tensor_pow_scalar_kernel_slow
+ CUDA: foreach_tensor_pow_scalar_kernel_cuda
-- func: _foreach_tanh_(Tensor(a!)[] self) -> ()
+- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tanh_slow_
- CUDA: foreach_tensor_tanh_cuda_
- autogen: _foreach_tanh.out
+ CPU: foreach_tensor_pow_scalarlist_kernel_slow
+ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
-- func: _foreach_sin(Tensor[] self) -> Tensor[]
+- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sin_slow
- CUDA: foreach_tensor_sin_cuda
+ CPU: foreach_scalar_pow_list_kernel_slow
+ CUDA: foreach_scalar_pow_list_kernel_cuda
-- func: _foreach_sin_(Tensor(a!)[] self) -> ()
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
+ device_check: NoCheck
variants: function
dispatch:
- CPU: foreach_tensor_sin_slow_
- CUDA: foreach_tensor_sin_cuda_
- autogen: _foreach_sin.out
+ CPU: foreach_tensor_pow_list_kernel_slow_
+ CUDA: foreach_tensor_pow_list_kernel_cuda_
+ autogen: _foreach_pow.List_out
-- func: _foreach_sinh(Tensor[] self) -> Tensor[]
+- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_pow_scalar_kernel_slow_
+ CUDA: foreach_tensor_pow_scalar_kernel_cuda_
+ autogen: _foreach_pow.Scalar_out
+
+- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CPU: foreach_tensor_pow_scalarlist_kernel_slow_
+ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
+ autogen: _foreach_pow.ScalarList_out
+
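_foreach_pow gains List, Scalar, ScalarList, and ScalarAndTensor overloads plus in-place variants, and _foreach_norm.Scalar reduces each tensor in the list to its ord-norm; together these cover the hot loops of gradient clipping and bias-correction math. A hedged sketch via the Python bindings assumed to match these schemas:

    import torch

    grads = [torch.randn(10), torch.randn(5, 5)]

    per_tensor_norms = torch._foreach_norm(grads, 2.0)   # one 0-dim norm per tensor
    squared = torch._foreach_pow(grads, 2.0)             # Scalar exponent, functional
    torch._foreach_pow_(grads, [1.0, 2.0])               # ScalarList exponent, in place
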
+- func: _foreach_reciprocal(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sinh_slow
- CUDA: foreach_tensor_sinh_cuda
+ CPU: foreach_tensor_reciprocal_slow
+ CUDA: foreach_tensor_reciprocal_cuda
-- func: _foreach_sinh_(Tensor(a!)[] self) -> ()
+- func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sinh_slow_
- CUDA: foreach_tensor_sinh_cuda_
- autogen: _foreach_sinh.out
+ CPU: foreach_tensor_reciprocal_slow_
+ CUDA: foreach_tensor_reciprocal_cuda_
+ autogen: _foreach_reciprocal.out
- func: _foreach_round(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
@@ -10420,55 +10848,10 @@
dispatch:
CPU: foreach_tensor_round_slow_
CUDA: foreach_tensor_round_cuda_
autogen: _foreach_round.out
-- func: _foreach_lgamma(Tensor[] self) -> Tensor[]
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_lgamma_slow
- CUDA: foreach_tensor_lgamma_cuda
-
-- func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_lgamma_slow_
- CUDA: foreach_tensor_lgamma_cuda_
- autogen: _foreach_lgamma.out
-
-- func: _foreach_frac(Tensor[] self) -> Tensor[]
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_frac_slow
- CUDA: foreach_tensor_frac_cuda
-
-- func: _foreach_frac_(Tensor(a!)[] self) -> ()
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_frac_slow_
- CUDA: foreach_tensor_frac_cuda_
- autogen: _foreach_frac.out
-
-- func: _foreach_reciprocal(Tensor[] self) -> Tensor[]
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_reciprocal_slow
- CUDA: foreach_tensor_reciprocal_cuda
-
-- func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_reciprocal_slow_
- CUDA: foreach_tensor_reciprocal_cuda_
- autogen: _foreach_reciprocal.out
-
- func: _foreach_sigmoid(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_sigmoid_slow
@@ -10480,155 +10863,131 @@
dispatch:
CPU: foreach_tensor_sigmoid_slow_
CUDA: foreach_tensor_sigmoid_cuda_
autogen: _foreach_sigmoid.out
-- func: _foreach_trunc(Tensor[] self) -> Tensor[]
+- func: _foreach_sign(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_trunc_slow
- CUDA: foreach_tensor_trunc_cuda
+ CPU: foreach_tensor_sign_slow
+ CUDA: foreach_tensor_sign_cuda
-- func: _foreach_trunc_(Tensor(a!)[] self) -> ()
+- func: _foreach_sign_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_trunc_slow_
- CUDA: foreach_tensor_trunc_cuda_
- autogen: _foreach_trunc.out
+ CPU: foreach_tensor_sign_slow_
+ CUDA: foreach_tensor_sign_cuda_
+ autogen: _foreach_sign.out
-- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+- func: _foreach_sin(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalar_slow_
- CUDA: foreach_tensor_addcdiv_scalar_cuda_
- autogen: _foreach_addcdiv.Scalar_out
+ CPU: foreach_tensor_sin_slow
+ CUDA: foreach_tensor_sin_cuda
-- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+- func: _foreach_sin_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalar_slow_
- CUDA: foreach_tensor_addcmul_scalar_cuda_
- autogen: _foreach_addcmul.Scalar_out
+ CPU: foreach_tensor_sin_slow_
+ CUDA: foreach_tensor_sin_cuda_
+ autogen: _foreach_sin.out
-- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+- func: _foreach_sinh(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
- CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
- autogen: _foreach_addcdiv.ScalarList_out
+ CPU: foreach_tensor_sinh_slow
+ CUDA: foreach_tensor_sinh_cuda
-- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+- func: _foreach_sinh_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_tensor_slow_
- CUDA: foreach_tensor_addcdiv_tensor_cuda_
- autogen: _foreach_addcdiv.Tensor_out
+ CPU: foreach_tensor_sinh_slow_
+ CUDA: foreach_tensor_sinh_cuda_
+ autogen: _foreach_sinh.out
-- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+- func: _foreach_sqrt(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalarlist_slow_
- CUDA: foreach_tensor_addcmul_scalarlist_cuda_
- autogen: _foreach_addcmul.ScalarList_out
+ CPU: foreach_tensor_sqrt_slow
+ CUDA: foreach_tensor_sqrt_cuda
-- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+- func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_tensor_slow_
- CUDA: foreach_tensor_addcmul_tensor_cuda_
- autogen: _foreach_addcmul.Tensor_out
+ CPU: foreach_tensor_sqrt_slow_
+ CUDA: foreach_tensor_sqrt_cuda_
+ autogen: _foreach_sqrt.out
-- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+- func: _foreach_tan(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalar_slow
- CUDA: foreach_tensor_addcdiv_scalar_cuda
+ CPU: foreach_tensor_tan_slow
+ CUDA: foreach_tensor_tan_cuda
-- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+- func: _foreach_tan_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalar_slow
- CUDA: foreach_tensor_addcmul_scalar_cuda
+ CPU: foreach_tensor_tan_slow_
+ CUDA: foreach_tensor_tan_cuda_
+ autogen: _foreach_tan.out
-- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+- func: _foreach_tanh(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalarlist_slow
- CUDA: foreach_tensor_addcdiv_scalarlist_cuda
+ CPU: foreach_tensor_tanh_slow
+ CUDA: foreach_tensor_tanh_cuda
-- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+- func: _foreach_tanh_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_tensor_slow
- CUDA: foreach_tensor_addcdiv_tensor_cuda
+ CPU: foreach_tensor_tanh_slow_
+ CUDA: foreach_tensor_tanh_cuda_
+ autogen: _foreach_tanh.out
-- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+- func: _foreach_trunc(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalarlist_slow
- CUDA: foreach_tensor_addcmul_scalarlist_cuda
+ CPU: foreach_tensor_trunc_slow
+ CUDA: foreach_tensor_trunc_cuda
-- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+- func: _foreach_trunc_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_tensor_slow
- CUDA: foreach_tensor_addcmul_tensor_cuda
+ CPU: foreach_tensor_trunc_slow_
+ CUDA: foreach_tensor_trunc_cuda_
+ autogen: _foreach_trunc.out
-- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
+- func: _foreach_zero_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_norm_slow
- CUDA: foreach_tensor_norm_cuda
- autogen: _foreach_norm.Scalar_out
+ CPU: foreach_tensor_zero_slow_
+ CUDA: foreach_tensor_zero_cuda_
+ autogen: _foreach_zero, _foreach_zero.out
-- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+- func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_ternary_lerp_slow
- CUDA: foreach_tensor_lerp_ternary_cuda
- autogen: _foreach_lerp.List_out
+ CPU: foreach_tensor_copy_list_kernel_slow_
+ CUDA: foreach_tensor_copy_list_kernel_cuda_
+ autogen: _foreach_copy, _foreach_copy.out
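_foreach_copy_ copies each src[i] into self[i] in a single fused call, with the functional and .out forms autogenerated; this is convenient for bulk parameter loads or restores. Roughly, through the Python binding assumed to mirror this entry:

    import torch

    dst = [torch.empty(3), torch.empty(2, 2)]
    src = [torch.randn(3), torch.randn(2, 2)]

    torch._foreach_copy_(dst, src)   # dst[i].copy_(src[i]) for every pair, fused
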
-- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_ternary_lerp_slow_
- CUDA: foreach_tensor_lerp_ternary_cuda_
- autogen: _foreach_lerp.List_out
-
-- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_lerp_list_kernel_slow
- CUDA: foreach_tensor_lerp_list_cuda
- autogen: _foreach_lerp.Scalar_out
-
-- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
- variants: function
- dispatch:
- CPU: foreach_tensor_lerp_list_kernel_slow_
- CUDA: foreach_tensor_lerp_list_cuda_
- autogen: _foreach_lerp.Scalar_out
-
- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
dispatch:
CPU: bucketize_cpu
CUDA: bucketize_cuda
@@ -10655,12 +11014,16 @@
- func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor
dispatch:
CPU: searchsorted_cpu
CUDA: searchsorted_cuda
- autogen: searchsorted.Scalar_out
+- func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CPU: searchsorted_out_cpu
+ CUDA: searchsorted_out_cuda
+
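The new searchsorted.Scalar_out entry gives the scalar overload its own out-variant kernel instead of relying on the autogenerated one. From Python this corresponds to passing a plain number together with out=; a sketch assuming the usual torch.searchsorted frontend:

    import torch

    boundaries = torch.tensor([1.0, 3.0, 5.0, 7.0])
    out = torch.empty((), dtype=torch.int64)

    torch.searchsorted(boundaries, 4.2, out=out)  # left insertion index for 4.2
    print(out.item())                             # -> 2
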
- func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor
structured_delegate: _convert_indices_from_coo_to_csr.out
- func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!)
structured: True
@@ -10979,10 +11342,11 @@
structured_inherits: TensorIteratorBase
device_check: NoCheck # TensorIterator
python_module: nn
dispatch:
CPU, CUDA: hardsigmoid_out
+ MPS: hardsigmoid_out_mps
QuantizedCPU: hardsigmoid_out_quantized_cpu
- func: hardsigmoid(Tensor self) -> Tensor
structured_delegate: hardsigmoid.out
device_check: NoCheck # TensorIterator
@@ -10999,10 +11363,11 @@
structured: True
structured_inherits: TensorIteratorBase
python_module: nn
dispatch:
CPU, CUDA: hardsigmoid_backward_out
+ MPS: hardsigmoid_backward_out_mps
- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
structured_delegate: hardsigmoid_backward.grad_input
python_module: nn
@@ -11117,29 +11482,33 @@
device_check: NoCheck # TensorIterator
python_module: nn
dispatch:
CPU: log_sigmoid_forward_out_cpu
CUDA: log_sigmoid_forward_out_cuda
+ MPS: log_sigmoid_forward_out_mps
- func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
device_check: NoCheck # TensorIterator
python_module: nn
dispatch:
CPU: log_sigmoid_forward_cpu
CUDA: log_sigmoid_forward_cuda
+ MPS: log_sigmoid_forward_mps
- func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
dispatch:
CPU: log_sigmoid_backward_cpu_out
CUDA: log_sigmoid_backward_cuda_out
+ MPS: log_sigmoid_backward_mps_out
- func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
python_module: nn
dispatch:
CPU: log_sigmoid_backward_cpu
CUDA: log_sigmoid_backward_cuda
+ MPS: log_sigmoid_backward_mps
- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
tags: nondeterministic_seeded
dispatch:
@@ -11277,10 +11646,11 @@
dispatch:
CPU: adaptive_avg_pool3d_cpu
CUDA: adaptive_avg_pool3d_cuda
QuantizedCPU: adaptive_avg_pool3d_quantized_cpu
autogen: _adaptive_avg_pool3d.out
+ tags: core
- func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
dispatch:
CPU: adaptive_avg_pool3d_backward_out_cpu
@@ -11392,10 +11762,11 @@
python_module: nn
structured_delegate: avg_pool3d.out
dispatch:
MkldnnCPU: mkldnn_avg_pool3d
QuantizedCPU: avg_pool3d_quantized_cpu
+ tags: core
- func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
structured: True
dispatch:
@@ -11515,29 +11886,29 @@
python_module: nn
dispatch:
CPU: max_pool3d_with_indices_backward_cpu
CUDA: max_pool3d_with_indices_backward_cuda
-- func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+- func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
dispatch:
CPU: max_unpooling2d_forward_out_cpu
CUDA: max_unpooling2d_forward_out_cuda
-- func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor
+- func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
python_module: nn
dispatch:
CPU: max_unpooling2d_forward_cpu
CUDA: max_unpooling2d_forward_cuda
-- func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
+- func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
dispatch:
CPU: max_unpooling3d_forward_out_cpu
CUDA: max_unpooling3d_forward_out_cuda
-- func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
+- func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
python_module: nn
dispatch:
CPU: max_unpooling3d_forward_cpu
CUDA: max_unpooling3d_forward_cuda
@@ -11551,10 +11922,11 @@
MPS: reflection_pad1d_out_mps
- func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
python_module: nn
structured_delegate: reflection_pad1d.out
+ tags: core
- func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
structured: True
dispatch:
@@ -11605,10 +11977,11 @@
MPS: reflection_pad3d_out_mps
- func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
python_module: nn
structured_delegate: reflection_pad3d.out
+ tags: core
- func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
structured: True
dispatch:
@@ -12067,10 +12440,11 @@
python_module: nn
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: logit_backward_out
+ MPS: logit_backward_out_mps
tags: pointwise
- func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
python_module: nn
structured_delegate: logit_backward.grad_input
@@ -12713,161 +13087,233 @@
#
# See fft_fft as an example.
# torch.fft.fft
# NOTE: NOT an alias for torch.fft, which has different semantics
-- func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
+- func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_fft_symint
-- func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_fft_symint_out
-- func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
+- func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ifft_symint
-- func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ifft_symint_out
-- func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
+- func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_rfft_symint
-- func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_rfft_symint_out
-- func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
+- func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_irfft_symint
-- func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_irfft_symint_out
-- func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
+- func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_hfft_symint
-- func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_hfft_symint_out
-- func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
+- func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ihfft_symint
-- func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ihfft_symint_out
-- func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+- func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_fft2_symint
-- func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_fft2_symint_out
-- func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+- func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ifft2_symint
-- func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ifft2_symint_out
-- func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+- func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_rfft2_symint
-- func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_rfft2_symint_out
-- func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+- func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_irfft2_symint
-- func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_irfft2_symint_out
-- func: fft_hfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+- func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_hfft2_symint
-- func: fft_hfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_hfft2_symint_out
-- func: fft_ihfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+- func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ihfft2_symint
-- func: fft_ihfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ihfft2_symint_out
-- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+- func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_fftn_symint
-- func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_fftn_symint_out
-- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+- func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ifftn_symint
-- func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ifftn_symint_out
-- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+- func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_rfftn_symint
-- func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_rfftn_symint_out
-- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+- func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_irfftn_symint
-- func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_irfftn_symint_out
-- func: fft_hfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+- func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_hfftn_symint
-- func: fft_hfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_hfftn_symint_out
-- func: fft_ihfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+- func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ihfftn_symint
-- func: fft_ihfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
use_const_ref_for_mutable_tensors: True
python_module: fft
variants: function
+ dispatch:
+ CompositeImplicitAutograd: fft_ihfftn_symint_out
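All of the torch.fft entries above switch their size arguments from int to SymInt and route through CompositeImplicitAutograd *_symint kernels, so transform lengths can stay symbolic under dynamic-shape tracing; the eager Python calls are unchanged. For reference, plain eager usage of the public torch.fft API:

    import torch

    x = torch.randn(100)

    X  = torch.fft.fft(x, n=128)                        # n is the (now SymInt) signal length
    Xr = torch.fft.rfft2(torch.randn(8, 8), s=(16, 16)) # s is the (now SymInt[]) output shape
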
- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
python_module: fft
variants: function
dispatch:
@@ -13208,10 +13654,11 @@
- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
python_module: linalg
structured: True
dispatch:
CPU, CUDA: linalg_vector_norm_out
+ MPS: linalg_vector_norm_out_mps
- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
python_module: linalg
- func: linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
@@ -13786,10 +14233,11 @@
- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
dispatch:
NestedTensorCPU: NestedTensor_softmax_dropout
NestedTensorCUDA: NestedTensor_softmax_dropout_cuda
+ tags: nondeterministic_seeded
# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
variants: function
dispatch:
@@ -13801,81 +14249,92 @@
dispatch:
CPU, NestedTensorCPU: native_multi_head_attention_cpu
CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
autogen: _native_multi_head_attention.out
-- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> Tensor
+- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor
python_module: nn
variants: function
autogen: scaled_dot_product_attention.out
+ tags: nondeterministic_seeded
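scaled_dot_product_attention (and the private _scaled_dot_product_* kernels below) gains a keyword-only scale argument that overrides the default 1/sqrt(head_dim) softmax scaling, and the flash/efficient paths now return their philox RNG state as tensors instead of ints. Eager usage with the public API; the custom scale value here is only an illustration:

    import math
    import torch
    import torch.nn.functional as F

    q = torch.randn(2, 8, 128, 64)   # (batch, heads, seq, head_dim)
    k = torch.randn(2, 8, 128, 64)
    v = torch.randn(2, 8, 128, 64)

    out = F.scaled_dot_product_attention(q, k, v, is_causal=True,
                                         scale=1.0 / math.sqrt(64))
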
-# TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS WITH THIS OP BUILTIN
-- func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
- python_module: nn
- variants: function
- autogen: _scaled_dot_product_attention.out
-
# This aten function is kept so that we can test the choice function from Python
-- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> int
+- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int
dispatch:
Meta: _fused_sdp_choice_meta
CPU, NestedTensorCPU: _fused_sdp_choice_cpp
CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
+ tags: nondeterministic_seeded
-- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
variants: function
+ tags: nondeterministic_seeded
-- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, int philox_seed, int philox_offset, Tensor debug_attn_mask)
+- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
dispatch:
+ CPU: _scaled_dot_product_flash_attention_cpu
CUDA: _scaled_dot_product_flash_attention_cuda
NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
+ tags: nondeterministic_seeded
-- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+ device_check: NoCheck
variants: function
dispatch:
+ CPU: _scaled_dot_product_flash_attention_backward_cpu
CUDA: _scaled_dot_product_flash_attention_backward_cuda
-- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
dispatch:
CUDA: _scaled_dot_product_efficient_attention_cuda
NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
+ tags: nondeterministic_seeded
-- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor)
+- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)
+ device_check: NoCheck
dispatch:
CUDA: _scaled_dot_product_efficient_attention_backward_cuda
+ tags: nondeterministic_seeded
-- func: _chunk_grad_outputs_efficient_attention(Tensor query, Tensor key, Tensor value, bool is_causal=False) -> bool
- dispatch:
- CUDA: _chunk_grad_outputs_efficient_attention
-
-- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask) -> (Tensor output, Tensor softmax_logsumexp, int philox_seed, int philox_offset, Tensor debug_attn_mask)
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
variants: function
dispatch:
CUDA: _flash_attention_forward
+ tags: nondeterministic_seeded
-- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor, Tensor, Tensor)
+- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+ device_check: NoCheck
variants: function
dispatch:
CUDA: _flash_attention_backward
# Returns output, logsumexp if compute_logsumexp
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
variants: function
dispatch:
CUDA: _efficient_attention_forward
+ tags: nondeterministic_seeded
-- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor)
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+ device_check: NoCheck
variants: function
dispatch:
CUDA: _efficient_attention_backward
- func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
variants: function
dispatch:
CUDA: triton_scaled_dot_attention
+ tags: nondeterministic_seeded
autogen: _triton_scaled_dot_attention.out
+- func: _fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)
+ variants: function
+ dispatch:
+ CUDA: _fill_mem_eff_dropout_mask_
+ tags: nondeterministic_seeded
+
- func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
variants: function
dispatch:
CUDA: triton_multi_head_attention
autogen: _triton_multi_head_attention.out
@@ -13893,22 +14352,10 @@
structured_inherits: TensorIteratorBase
structured: True
variants: function
tags: pointwise
-- func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
- variants: function
- dispatch:
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_decoder_only_layer_forward
- autogen: _transformer_decoder_only_layer_fwd.out
-
-- func: _native_decoder_only_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor, Tensor, Tensor)
- variants: function
- dispatch:
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_decoder_only_multi_head_attention
- autogen: _native_decoder_only_multi_head_attention.out
-
- func: special_bessel_j0(Tensor self) -> Tensor
python_module: special
structured_delegate: special_bessel_j0.out
variants: function
tags: pointwise
@@ -14601,11 +15048,33 @@
variants: function
dispatch:
CUDA: _fused_adam_kernel_cuda_
autogen: _fused_adam, _fused_adam.out
+- func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_adam_kernel_cuda_
+ autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
+
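The .tensor_lr overloads let the fused CUDA Adam/AdamW step take the learning rate as a 0-dim Tensor rather than a Python float, and the device check is skipped because, per the comment above, that lr tensor may live on CPU. A hedged sketch of how this could surface through the optimizer frontend, assuming fused Adam accepts a tensor lr as the schema suggests (requires a CUDA device):

    import torch

    model = torch.nn.Linear(16, 4).cuda()
    opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(1e-3), fused=True)

    loss = model(torch.randn(8, 16, device="cuda")).sum()
    loss.backward()
    opt.step()   # expected to dispatch to _fused_adam_.tensor_lr (assumed)
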
- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
variants: function
dispatch:
CUDA: _fused_adamw_kernel_cuda_
autogen: _fused_adamw, _fused_adamw.out
+
+- func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_adamw_kernel_cuda_
+ autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
+
+# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
+- func: _propagate_xla_data(Tensor input, Tensor output) -> ()
+ variants: function