codegen/native_functions.yaml: torch-rb-0.13.2 vs torch-rb-0.14.0

- removed: line present only in torch-rb-0.13.2 (old)
+ added: line present only in torch-rb-0.14.0 (new)
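
Each entry in native_functions.yaml is a small YAML mapping: a "- func:" line giving the operator schema, followed by optional keys such as variants, dispatch, autogen, and tags. As a reading aid, here is one of the entries added in 0.14.0 expanded into that layout; this is a sketch only, with the two-space indentation assumed from the file's usual style:

# _assert_async overload added in 0.14.0; dispatch maps backend keys to kernel names
- func: _assert_async.msg(Tensor self, str assert_msg) -> ()
  dispatch:
    CPU: _assert_async_msg_cpu
    CUDA: _assert_async_msg_cuda

Most of the churn below follows a few recurring patterns that are visible throughout the hunks: int/int[] arguments widened to SymInt/SymInt[], new MPS and NestedTensorCPU/NestedTensorCUDA dispatch entries, and additional core and nondeterministic_seeded tags.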

@@ -168,13 +168,41 @@ - func: _assert_async(Tensor self) -> () dispatch: CPU: _assert_async_cpu CUDA: _assert_async_cuda +- func: _assert_async.msg(Tensor self, str assert_msg) -> () + dispatch: + CPU: _assert_async_msg_cpu + CUDA: _assert_async_msg_cuda -- func: _assert_tensor_metadata(Tensor a, int[]? size=None, int[]? stride=None, ScalarType? dtype=None) -> () +- func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor + dispatch: + CPU: _functional_assert_async_msg_cpu +- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> () + +- func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> () + dispatch: + CompositeExplicitAutograd: sym_constrain_range + +- func: sym_constrain_range_for_size(Scalar size, *, int? min, int? max) -> () + dispatch: + CompositeExplicitAutograd: sym_constrain_range_for_size + +- func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor + dispatch: + CompositeExplicitAutograd: _functional_sym_constrain_range + +- func: _functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor + dispatch: + CompositeExplicitAutograd: _functional_sym_constrain_range_for_size + +- func: _make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + dispatch: + CPU: _make_dep_token_cpu + - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) variants: method - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool device_check: NoCheck # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss @@ -209,20 +237,22 @@ # Disable dispatch time device check for consistent behavior. device_check: NoCheck dispatch: CUDA: _cudnn_rnn autogen: _cudnn_rnn.out + tags: nondeterministic_seeded - func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) dispatch: CUDA: _cudnn_rnn_backward autogen: _cudnn_rnn_backward.out - func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: CUDA: _cudnn_init_dropout_state autogen: _cudnn_init_dropout_state.out + tags: nondeterministic_seeded - func: _debug_has_internal_overlap(Tensor self) -> int variants: function - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) @@ -295,19 +325,21 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs SparseCPU, SparseCUDA: abs_sparse SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs tags: [core, pointwise] - func: abs_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: abs_ SparseCPU, SparseCUDA: abs_sparse_ SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_ - func: abs.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: abs_out @@ -372,26 +404,28 @@ CPU, CUDA, MPS, Meta: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA, Meta: view_as_complex + CPU, CUDA, MPS, Meta: view_as_complex - func: sgn(Tensor self) -> Tensor variants: function, method structured_delegate: sgn.out dispatch: SparseCPU, SparseCUDA: sgn_sparse SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn tags: pointwise - func: sgn_(Tensor(a!) self) -> Tensor(a!) variants: method structured_delegate: sgn.out dispatch: SparseCPU, SparseCUDA: sgn_sparse_ SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_ tags: pointwise - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -486,12 +520,14 @@ variants: function, method - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + tags: core - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor + tags: core # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor @@ -608,17 +644,17 @@ dispatch: CPU, CUDA: addr_out MPS: addr_out_mps CompositeExplicitAutograd: math_addr_out -- func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor +- func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor variants: function dispatch: CompositeExplicitAutograd: affine_grid_generator autogen: affine_grid_generator.out -- func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor +- func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor variants: function - func: _is_all_true(Tensor self) -> Tensor variants: function, method dispatch: @@ -631,10 +667,17 @@ # Note: this function is only for testing. - func: _test_check_tensor(Tensor self) -> Tensor variants: function +# Note; this function is only for testing +- func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor + variants: function + dispatch: + CPU: _test_functorch_fallback + autogen: _test_functorch_fallback.out + - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: all.out variants: function, method @@ -662,10 +705,11 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: any.out variants: function, method + tags: core - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True precomputed: @@ -1106,18 +1150,19 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: bitwise_not_out + MPS: bitwise_not_out_mps tags: pointwise - func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: copysign_out + CPU, CUDA, MPS: copysign_out tags: pointwise - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -1148,17 +1193,19 @@ - func: logical_not(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: logical_not + NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not tags: [core, pointwise] - func: logical_not_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: logical_not_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_ tags: pointwise - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: @@ -1169,11 +1216,11 @@ - func: logical_xor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: logical_xor - tags: pointwise + tags: [core, pointwise] - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: @@ -1324,11 +1371,11 @@ structured_delegate: ceil.out variants: function, method dispatch: SparseCPU, SparseCUDA: ceil_sparse SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr - tags: pointwise + tags: [core, pointwise] - func: ceil_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: ceil.out variants: function, method @@ -1391,11 +1438,11 @@ tags: [core, pointwise] - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor variants: function, method structured_delegate: clamp.Tensor_out - tags: pointwise + tags: [core, pointwise] - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method cpp_no_default_args: ['min'] @@ -1550,10 +1597,11 @@ CompositeExplicitAutograd: polar - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: polar_out + MPS: polar_out_mps - func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor variants: function dispatch: CompositeExplicitAutograd: constant_pad_nd @@ -1596,15 +1644,21 @@ - func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor - func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) -- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv1d_symint -- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d(Tensor input, Tensor weight, Tensor? 
bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv2d_symint -- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv3d_symint - func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] - func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding="valid", int[2] dilation=1, int groups=1) -> Tensor @@ -1619,15 +1673,21 @@ autogen: conv_tbc.out - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) # NB: we inherit the goofy argument order from PyTorch torch.nn.functional -- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose1d_symint -- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor +- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose2d_symint -- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor + dispatch: + CompositeImplicitAutograd: conv_transpose3d_symint - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor variants: function dispatch: CompositeExplicitAutogradNonFunctional: copy @@ -1848,10 +1908,11 @@ - func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: cumprod_out + MPS: cumprod_out_mps - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -1868,10 +1929,11 @@ - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor structured_delegate: cumsum.out device_check: NoCheck # TensorIterator variants: function, method + tags: core - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) 
structured_delegate: cumsum.out variants: method @@ -2143,10 +2205,11 @@ - func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor dispatch: CompositeExplicitAutograd: embedding_symint NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding autogen: embedding.out + tags: core - func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor dispatch: CompositeImplicitAutograd: embedding_backward_symint @@ -2200,10 +2263,11 @@ - func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor) dispatch: CPU: _embedding_bag_cpu CUDA: _embedding_bag_cuda autogen: _embedding_bag.out + tags: core - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor dispatch: CompositeImplicitAutograd: _embedding_bag_backward_symint @@ -2238,11 +2302,17 @@ Meta: empty_meta_symint MkldnnCPU: empty_mkldnn SparseCPU, SparseCUDA, SparseMeta: empty_sparse SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized + tags: core +- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + dispatch: + CompositeExplicitAutograd: empty_permuted_symint + autogen: empty_permuted.out + # We do not make new_empty a composite that calls into new_empty_strided, as the strided version # is significantly more difficult to implement by different backends - func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor variants: method dispatch: @@ -2278,19 +2348,19 @@ # non-differentiable so NonFunctional doesn't apply CompositeExplicitAutograd: new_ones autogen: new_ones.out # other overrides are to provide a more helpful error message that dtype is required -- func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor +- func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor dispatch: CPU: empty_affine_quantized_other_backends_stub QuantizedCPU, QuantizedCUDA: empty_affine_quantized autogen: _empty_affine_quantized.out # it's a factory function receiving a tensor argument, thus overriding explicitly # other overrides are to provide a more helpful error message that dtype is required -- func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor +- func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor category_override: factory dispatch: CPU: empty_per_channel_affine_quantized_other_backends_stub QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized autogen: _empty_per_channel_affine_quantized.out @@ -2311,11 +2381,11 @@ autogen: resize, resize.out # This is a utility function to enable users to resize out tensor while registering kernels for out variants. # Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration # to make it easy to register out variants for ops. -- func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!) +- func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: function dispatch: Meta: _resize_output_ autogen: _resize_output, _resize_output.out @@ -2481,25 +2551,25 @@ variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_check: NoCheck device_guard: False # decomposes to eye.m -- func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CompositeExplicitAutograd: eye -- func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CompositeExplicitAutograd: eye -- func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!) +- func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: eye_out_cpu CUDA: eye_out_cuda MPS: eye_out_mps -- func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!) +- func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: eye_out_cpu CUDA: eye_out_cuda MPS: eye_out_mps @@ -2513,15 +2583,19 @@ variants: function, method - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) variants: function, method -- func: unflatten.int(Tensor(a) self, int dim, int[] sizes) -> Tensor(a) +- func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a) variants: function, method + dispatch: + CompositeImplicitAutograd: unflatten_symint -- func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) +- func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a) variants: function, method + dispatch: + CompositeImplicitAutograd: unflatten_dimname_symint - func: fill.Scalar(Tensor self, Scalar value) -> Tensor variants: function dispatch: CompositeExplicitAutograd: fill @@ -2837,17 +2911,17 @@ dispatch: CPU: _fft_r2c_mkl_out CUDA: _fft_r2c_cufft_out # Complex to real inverse FFT -- func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor +- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor variants: function dispatch: CPU: _fft_c2r_mkl CUDA: _fft_c2r_cufft -- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!) +- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!) 
variants: function dispatch: CPU: _fft_c2r_mkl_out CUDA: _fft_c2r_cufft_out @@ -2869,25 +2943,25 @@ variants: function dispatch: CPU: _validate_compressed_sparse_indices_cpu CUDA: _validate_compressed_sparse_indices_cuda -- func: _cufft_get_plan_cache_size(int device_index) -> int +- func: _cufft_get_plan_cache_size(DeviceIndex device_index) -> int -- func: _cufft_get_plan_cache_max_size(int device_index) -> int +- func: _cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int -- func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> () +- func: _cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> () -- func: _cufft_clear_plan_cache(int device_index) -> () +- func: _cufft_clear_plan_cache(DeviceIndex device_index) -> () - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: index.Tensor_out variants: function, method dispatch: QuantizedCPU: quantized_index - tags: dynamic_output_shape + tags: [core, dynamic_output_shape] # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef<TensorIndex> indices) # - Tensor Tensor::index(std::initializer_list<TensorIndex> indices) @@ -2898,10 +2972,17 @@ precomputed: - indices -> DimVector sizes, DimVector strides dispatch: CPU, CUDA, MPS: index_out +# Used by inductor to signal indexing without bounds checks +# Note that we don't support boolean indexing, to avoid dynamic output shapes +- func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + variants: function + dispatch: + CPU, CUDA: _unsafe_index + - func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) structured: True variants: function precomputed: - dim -> int dim @@ -2937,11 +3018,18 @@ - func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: index_put + tags: core +- func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: _unsafe_index_put + - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: CPU, CUDA, MPS: _index_put_impl_ @@ -3095,10 +3183,11 @@ - func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_backward_cpu CUDA: layer_norm_backward_cuda MPS: layer_norm_backward_mps + NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested autogen: native_layer_norm_backward.out tags: core - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? 
neginf=None) -> Tensor variants: function, method @@ -3158,10 +3247,22 @@ - func: mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: MkldnnCPU: mkldnn_linear_backward autogen: mkldnn_linear_backward.out +- func: _cslt_compress(Tensor input) -> Tensor + dispatch: + CUDA: _cslt_compress + +- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor + dispatch: + CUDA: _cslt_sparse_mm + +- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_linear + - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) @@ -3353,10 +3454,11 @@ structured: True structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: xlogy_out + MPS: xlogy_out_mps tags: pointwise - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function @@ -3508,10 +3610,11 @@ - func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max) device_check: NoCheck # TensorIterator structured: True dispatch: CPU, CUDA: aminmax_out + MPS: aminmax_out_mps - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor dispatch: CPU, CUDA: _compute_linear_combination @@ -3605,19 +3708,25 @@ dispatch: QuantizedCPU: quantized_max_pool2d QuantizedCUDA: quantized_max_pool2d_cudnn autogen: quantized_max_pool2d.out +- func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + QuantizedCPU: quantized_max_pool3d + autogen: quantized_max_pool3d.out + - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: mean + tags: core # For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this. # FIXME: fix CI jobs and re-enable this #- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
# device_check: NoCheck # TensorIterator @@ -3754,10 +3863,11 @@ autogen: mkldnn_convolution.out - func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor) dispatch: CPU: mkldnn_rnn_layer + MkldnnCPU: mkldnn_rnn_layer autogen: mkldnn_rnn_layer.out - func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: CPU: mkldnn_rnn_layer_backward @@ -3798,11 +3908,13 @@ - func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: miopen_rnn autogen: miopen_rnn.out + tags: nondeterministic_seeded + - func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) dispatch: CUDA: miopen_rnn_backward autogen: miopen_rnn_backward.out @@ -3821,10 +3933,18 @@ CUDA: mm_out_cuda MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out +- func: _int_mm(Tensor self, Tensor mat2) -> Tensor + dispatch: + CUDA: _int_mm_cuda + +- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CUDA: _int_mm_out_cuda + - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor python_module: sparse - func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor python_module: sparse @@ -3979,11 +4099,10 @@ dispatch: CPU: batch_norm_cpu CUDA: batch_norm_cuda MPS: batch_norm_mps MkldnnCPU: mkldnn_batch_norm - tags: core - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: CUDA: batch_norm_cuda_out MPS: batch_norm_mps_out @@ -3995,11 +4114,21 @@ CPU: _batch_norm_legit_cpu CUDA: _batch_norm_legit_cuda MPS: _batch_norm_legit_mps MkldnnCPU: _mkldnn_batch_norm_legit autogen: _native_batch_norm_legit_functional + tags: core +# HACK: identical to _native_batch_norm_legit, but training is known to be False, +# So we known that running stats will not be mutated. +# The real fix here is batch norm consolidation. +- func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? 
bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _batch_norm_legit_no_training + autogen: _native_batch_norm_legit_no_training.out + tags: core + - func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!)) dispatch: CPU: _batch_norm_legit_cpu_out CUDA: _batch_norm_legit_cuda_out MPS: _batch_norm_legit_mps_out @@ -4053,11 +4182,11 @@ - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: batch_norm_backward_reduce_cuda autogen: batch_norm_backward_reduce.out -- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu, Tensor count) -> Tensor +- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor dispatch: CUDA: batch_norm_backward_elemt_cuda autogen: batch_norm_backward_elemt.out - func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor) @@ -4111,10 +4240,11 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor dispatch: CPU, CUDA: _cdist_forward MPS: _cdist_forward_mps autogen: _cdist_forward.out + tags: core - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor dispatch: CPU, CUDA: _cdist_backward autogen: _cdist_backward.out @@ -4123,10 +4253,11 @@ - func: _pdist_forward(Tensor self, float p=2) -> Tensor dispatch: CPU, CUDA: _pdist_forward autogen: _pdist_forward.out + tags: core - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor dispatch: CPU, CUDA: _pdist_backward autogen: _pdist_backward.out @@ -4183,20 +4314,21 @@ - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor dispatch: CPU: pixel_shuffle_cpu CompositeExplicitAutogradNonFunctional: math_pixel_shuffle autogen: pixel_shuffle.out + tags: core - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor dispatch: CPU: pixel_unshuffle_cpu CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle autogen: pixel_unshuffle.out - func: channel_shuffle(Tensor self, int groups) -> Tensor dispatch: - CPU: channel_shuffle + CPU, CUDA: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu autogen: channel_shuffle.out - func: native_channel_shuffle(Tensor self, int groups) -> Tensor dispatch: @@ -4292,11 +4424,11 @@ dispatch: CompositeExplicitAutograd: rand autogen: rand.generator_with_names_out - func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - tags: nondeterministic_seeded + tags: [core, nondeterministic_seeded] dispatch: CompositeExplicitAutograd: rand - func: rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor tags: nondeterministic_seeded @@ -4317,68 +4449,68 @@ # NB: Although this composite mutates on the inside, it is # non-differentiable so NonFunctional doesn't apply CompositeExplicitAutograd: rand_like autogen: rand_like.out -- func: randint(int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint -- func: randint.generator(int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint -- func: randint.low(int low, int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint -- func: randint.low_generator(int low, int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint -- func: randint.out(int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) +- func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint_out -- func: randint.generator_out(int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) +- func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint_out -- func: randint.low_out(int low, int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) +- func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint_out -- func: randint.low_generator_out(int low, int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) +- func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randint_out -- func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +- func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor tags: nondeterministic_seeded dispatch: # NB: Although this composite mutates on the inside, it is # non-differentiable so NonFunctional doesn't apply CompositeExplicitAutograd: randint_like autogen: randint_like.out -- func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +- func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor tags: nondeterministic_seeded dispatch: # NB: Although this composite mutates on the inside, it is # non-differentiable so NonFunctional doesn't apply CompositeExplicitAutograd: randint_like autogen: randint_like.low_dtype_out - func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - tags: nondeterministic_seeded + tags: [core, nondeterministic_seeded] dispatch: CompositeExplicitAutograd: randn - func: randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor tags: nondeterministic_seeded @@ -4410,29 +4542,29 @@ - func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor tags: nondeterministic_seeded dispatch: # NB: Although this composite mutates on the inside, it is # non-differentiable so NonFunctional doesn't apply - CompositeExplicitAutograd: randn_like + CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like autogen: randn_like.out -- func: randperm(int n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - tags: nondeterministic_seeded +- func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + tags: [core, nondeterministic_seeded] dispatch: CompositeExplicitAutograd: randperm -- func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randperm -- func: randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!) +- func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: CompositeExplicitAutograd: randperm_out -- func: randperm.generator_out(int n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) +- func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: CPU: randperm_out_cpu CUDA: randperm_out_cuda MPS: randperm_out_mps @@ -4589,11 +4721,11 @@ structured_delegate: round.out variants: function, method dispatch: SparseCPU, SparseCUDA: round_sparse SparseCsrCPU, SparseCsrCUDA: round_sparse_csr - tags: pointwise + tags: [core, pointwise] - func: round_(Tensor(a!) self) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured_delegate: round.out variants: function, method @@ -4837,14 +4969,18 @@ autogen: celu.out - func: silu(Tensor self) -> Tensor structured_delegate: silu.out python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu - func: silu_(Tensor(a!) self) -> Tensor(a!) structured_delegate: silu.out python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn @@ -4863,10 +4999,11 @@ - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor structured_delegate: silu_backward.grad_input python_module: nn dispatch: CompositeImplicitAutograd: math_silu_backward + NestedTensorCPU, NestedTensorCUDA: silu_backward_nested - func: mish(Tensor self) -> Tensor structured_delegate: mish.out python_module: nn @@ -4915,10 +5052,11 @@ - func: logit(Tensor self, float? eps=None) -> Tensor variants: function, method dispatch: CPU, CUDA: logit + MPS: logit_mps tags: pointwise - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) variants: function, method dispatch: @@ -4926,10 +5064,11 @@ tags: pointwise - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: logit_out + MPS: logit_out_mps tags: pointwise - func: sin(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: sin.out @@ -5040,10 +5179,31 @@ - func: size.Dimname(Tensor self, Dimname dim) -> int variants: function, method device_check: NoCheck device_guard: False +- func: sym_size.int(Tensor self, int dim) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sym_numel(Tensor self) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + +- func: sym_storage_offset(Tensor self) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + - func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False dispatch: @@ -5064,36 +5224,37 @@ - func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor variants: function, method device_check: NoCheck device_guard: False dispatch: - CompositeExplicitAutograd: slice_scatter + CompositeExplicitAutogradNonFunctional: slice_scatter autogen: slice_scatter.out tags: core - func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor variants: function, method device_check: NoCheck device_guard: False dispatch: - CompositeExplicitAutograd: select_scatter_symint + CompositeExplicitAutogradNonFunctional: select_scatter_symint autogen: select_scatter.out + tags: core - func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor variants: function, method device_check: NoCheck device_guard: False dispatch: - CompositeExplicitAutograd: diagonal_scatter + CompositeExplicitAutogradNonFunctional: diagonal_scatter autogen: diagonal_scatter.out - func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? 
storage_offset=None) -> Tensor variants: function, method device_check: NoCheck device_guard: False dispatch: - CompositeExplicitAutograd: as_strided_scatter_symint + CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint autogen: as_strided_scatter.out - func: smm(Tensor self, Tensor mat2) -> Tensor variants: function, method @@ -5168,10 +5329,12 @@ variants: function, method device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: split_with_sizes + NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested + tags: core - func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] variants: function, method - func: hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] @@ -5314,26 +5477,35 @@ - func: stride.Dimname(Tensor self, Dimname dim) -> int variants: function, method device_check: NoCheck device_guard: False +- func: sym_stride.int(Tensor self, int dim) -> SymInt + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: sum SparseCPU, SparseCUDA: sum_coo SparseCsrCPU, SparseCsrCUDA: sum_csr autogen: sum.out - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype structured_delegate: sum.IntList_out device_check: NoCheck # TensorIterator variants: function, method dispatch: NestedTensorCPU: NestedTensor_sum_dim_CPU SparseCPU, SparseCUDA: sum_sparse_coo + SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed tags: core - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -5362,14 +5534,16 @@ - func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: nansum_out MPS: nansum_out_mps -- func: sum_to_size(Tensor self, int[] size) -> Tensor +- func: sum_to_size(Tensor self, SymInt[] size) -> Tensor variants: method device_check: NoCheck device_guard: False + dispatch: + CompositeImplicitAutograd: sum_to_size_symint - func: sqrt(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: sqrt.out variants: function, method @@ -5419,11 +5593,11 @@ - func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method cpp_no_default_args: ["unbiased"] -- func: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor +- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: std MPS: std_mps @@ -5437,11 +5611,11 @@ - func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function cpp_no_default_args: ["unbiased"] -- func: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor) +- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? 
correction=None, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function dispatch: CPU, CUDA: std_mean autogen: std_mean.correction_out @@ -5449,19 +5623,19 @@ - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function cpp_no_default_args: ["unbiased"] -- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor) +- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function - func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator cpp_no_default_args: ["unbiased"] -- func: std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) +- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: std_out QuantizedCPU: std_out_quantized_cpu @@ -5472,30 +5646,32 @@ - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator cpp_no_default_args: ["unbiased"] -- func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor +- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method -- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) +- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: prod MPS: prod_mps autogen: prod.out + tags: core - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: prod.int_out device_check: NoCheck # TensorIterator variants: function, method + tags: core - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) structured: True device_check: NoCheck # TensorIterator dispatch: @@ -5529,11 +5705,11 @@ structured_delegate: tan.out variants: function, method dispatch: SparseCPU, SparseCUDA: tan_sparse SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr - tags: pointwise + tags: [core, pointwise] - func: tan_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: tan.out variants: function, method @@ -5590,12 +5766,10 @@ - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) 
variants: function - dispatch: - CPU, CUDA: tensordot_out # TODO: namespace threshold in 'nn' - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor device_check: NoCheck # TensorIterator variants: function @@ -5633,12 +5807,14 @@ SparseCPU, SparseCUDA: threshold_backward_sparse SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested tags: pointwise -- func: tile(Tensor self, int[] dims) -> Tensor +- func: tile(Tensor self, SymInt[] dims) -> Tensor variants: function, method + dispatch: + CompositeImplicitAutograd: tile_symint - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False @@ -5689,16 +5865,17 @@ variants: function, method - func: flipud(Tensor self) -> Tensor variants: function, method -- func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor +- func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor variants: function, method dispatch: - CPU: roll_cpu + CPU, MPS: roll CUDA: roll_cuda autogen: roll.out + tags: core # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor variants: function, method @@ -5748,31 +5925,32 @@ variants: method dispatch: NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides autogen: _nested_tensor_strides.out -- func: _nested_tensor_offsets(Tensor self) -> int[] +- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor variants: method dispatch: - NestedTensorCPU, NestedTensorCUDA: _nested_tensor_offsets + NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets + autogen: _nested_tensor_storage_offsets.out # _nested_from_padded is not usable from Python, so # _nested_from_padded_and_nested_example is available for testing. - func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor dispatch: NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example autogen: _nested_from_padded_and_nested_example.out # The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation # this will need to be updated -- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor(a) +- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a) variants: function device_check: NoCheck dispatch: CPU, CUDA: _nested_view_from_buffer -- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor +- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor variants: function device_check: NoCheck tags: view_copy dispatch: CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy @@ -5911,22 +6089,23 @@ device_check: NoCheck # TensorIterator variants: function, method tags: core cpp_no_default_args: ["unbiased"] -- func: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor +- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? 
correction=None, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: var MPS: var_mps + tags: core - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator cpp_no_default_args: ["unbiased"] -- func: var.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) +- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor @@ -5936,15 +6115,15 @@ - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator cpp_no_default_args: ["unbiased"] -- func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor +- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method -- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) +- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator @@ -5954,11 +6133,11 @@ - func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function cpp_no_default_args: ["unbiased"] -- func: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor) +- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function dispatch: CPU, CUDA: var_mean autogen: var_mean.correction_out @@ -5966,11 +6145,11 @@ - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function cpp_no_default_args: ["unbiased"] -- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor) +- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) variants: method @@ -6034,11 +6213,11 @@ device_guard: False dispatch: CompositeExplicitAutograd: zeros autogen: zeros.names_out -- func: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor dispatch: CPU: _efficientzerotensor CUDA: _efficientzerotensor_cuda Meta: _efficientzerotensor_meta autogen: _efficientzerotensor.out @@ -6054,11 +6233,11 @@ - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor dispatch: # NB: Although this composite mutates on the inside, it is # non-differentiable so NonFunctional doesn't apply - CompositeExplicitAutograd: zeros_like + CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like autogen: zeros_like.out - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor variants: function dispatch: @@ -6295,11 +6474,11 @@ SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone NestedTensorCPU, NestedTensorCUDA: clone_nested autogen: clone.out - tags: core + tags: [core, pointwise] - func: positive(Tensor(a) self) -> Tensor(a) variants: function, method tags: pointwise @@ -6307,10 +6486,11 @@ use_const_ref_for_mutable_tensors: True variants: function, method dispatch: CompositeExplicitAutograd: resize_as_ autogen: resize_as, resize_as.out + tags: inplace_view - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: function, method dispatch: @@ -6326,10 +6506,11 @@ MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ + NestedTensorCPU, NestedTensorCUDA: zero_nested_ autogen: zero, zero.out - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True @@ -6345,10 +6526,11 @@ variants: function, method structured_delegate: sub.out dispatch: SparseCPU, SparseCUDA: sub_sparse ZeroTensor: sub_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor tags: [core, pointwise] - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method @@ -6491,10 +6673,20 @@ - func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor structured_delegate: _addmm_activation.out variants: function, method +- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _scaled_mm_cuda + +- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!)) + variants: function + dispatch: + CUDA: _scaled_mm_out_cuda + # NOTE [ Sparse: autograd and API ] # # # Sparse Tensor Constructors # ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -6603,16 +6795,21 @@ # sparse tensor. # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor + - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor @@ -6625,19 +6822,19 @@ - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: CompositeExplicitAutograd: sparse_coo_tensor autogen: sparse_coo_tensor.size_out -- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor -- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor -- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? 
is_coalesced=None) -> Tensor dispatch: CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint -- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () +- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> () - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> () - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () @@ -6646,11 +6843,11 @@ - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse autogen: _sparse_coo_tensor_with_dims.out -- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor dispatch: SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint autogen: _sparse_coo_tensor_with_dims_and_tensors.out - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) @@ -6669,29 +6866,35 @@ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor variants: method dispatch: SparseCPU, SparseCUDA: sparse_mask - SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr + SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed autogen: sparse_mask.out +- func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor + variants: method + dispatch: + SparseCPU, SparseCUDA: sparse_mask_projection + autogen: _sparse_mask_projection.out + - func: _to_cpu(Tensor[] tensors) -> Tensor[] variants: function -- func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor +- func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor variants: method # Special case of to_dense with custom derivative -- func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor +- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor variants: method dispatch: SparseCPU, SparseCUDA: sparse_to_dense SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense autogen: _to_dense.out -- func: to_dense_backward(Tensor grad, Tensor input) -> Tensor +- func: to_dense_backward(Tensor grad, Tensor input, bool? 
masked_grad=None) -> Tensor - func: sparse_dim(Tensor self) -> int variants: method dispatch: CPU, CUDA: sparse_dim_strided @@ -6857,56 +7060,85 @@ - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor variants: method + +# Special case of to_sparse.sparse_dim with custom derivative +- func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor + variants: method dispatch: CPU, CUDA: dense_to_sparse SparseCPU, SparseCUDA: sparse_coo_to_sparse SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse - autogen: to_sparse.sparse_dim_out + autogen: _to_sparse.sparse_dim_out - func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor variants: method + +# Special case of to_sparse with custom derivative +- func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor + variants: method dispatch: CPU, CUDA: dense_to_sparse SparseCPU, SparseCUDA: sparse_coo_to_sparse SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse - autogen: to_sparse.out + autogen: _to_sparse.out - func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor variants: method + +# Special case of to_sparse_csr with custom derivative +- func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor + variants: method dispatch: CPU, CUDA: dense_to_sparse_csr SparseCPU, SparseCUDA: coo_to_sparse_csr SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr - autogen: to_sparse_csr.out + autogen: _to_sparse_csr.out - func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor variants: method + +# Special case of to_sparse_csc with custom derivative +- func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor + variants: method dispatch: CPU, CUDA: dense_to_sparse_csc SparseCPU, SparseCUDA: coo_to_sparse_csc SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc - autogen: to_sparse_csc.out + autogen: _to_sparse_csc.out - func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor variants: method + +# Special case of to_sparse_bsr with custom derivative +- func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method dispatch: CPU, CUDA: dense_to_sparse_bsr SparseCPU, SparseCUDA: coo_to_sparse_bsr SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr - autogen: to_sparse_bsr.out + autogen: _to_sparse_bsr.out - func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor variants: method + +# Special case of to_sparse_bsc with custom derivative +- func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor + variants: method dispatch: CPU, CUDA: dense_to_sparse_bsc SparseCPU, SparseCUDA: coo_to_sparse_bsc SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc - autogen: to_sparse_bsc.out + autogen: _to_sparse_bsc.out +- func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _to_sparse_semi_structured + - func: to_mkldnn(Tensor self, ScalarType? 
dtype=None) -> Tensor variants: method dispatch: CPU: dense_to_mkldnn autogen: to_mkldnn.out @@ -7172,11 +7404,11 @@ - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType variants: function # NB: Does NOT check precondition that numel == 1 - func: _local_scalar_dense(Tensor self) -> Scalar - tags: data_dependent_output + tags: [core, data_dependent_output] dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda MPS: _local_scalar_dense_mps variants: function @@ -7185,12 +7417,13 @@ - func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: MPS: _lstm_mps autogen: _lstm_mps.out + tags: nondeterministic_seeded -- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) +- func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) dispatch: MPS: lstm_mps_backward autogen: lstm_mps_backward.out @@ -7224,24 +7457,32 @@ - func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) + tags: nondeterministic_seeded - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) + tags: nondeterministic_seeded - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) + tags: nondeterministic_seeded - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) + tags: nondeterministic_seeded - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? 
b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) - func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor @@ -7380,10 +7621,11 @@ - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: masked_fill + NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill tags: pointwise - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method @@ -7404,10 +7646,11 @@ - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) variants: method dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda + MPS: masked_scatter__mps autogen: masked_scatter.out - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor variants: function, method dispatch: @@ -7501,10 +7744,11 @@ device_check: NoCheck # TensorIterator variants: method dispatch: CPU: index_fill_ CUDA: index_fill_ + MPS: index_fill_mps_ autogen: index_fill.int_Scalar_out - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -7514,10 +7758,11 @@ - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: index_fill_ + MPS: index_fill_mps_ autogen: index_fill.int_Tensor_out - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -7541,10 +7786,11 @@ variants: function, method - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor structured_delegate: scatter.src_out variants: function, method + tags: core - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) structured_delegate: scatter.src_out variants: method @@ -7556,10 +7802,11 @@ MPS: scatter_src_out_mps - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor structured_delegate: scatter.value_out variants: function, method + tags: core - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) structured_delegate: scatter.value_out variants: method @@ -7655,10 +7902,11 @@ structured: True structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: bitwise_and_out + MPS: bitwise_and_out_mps tags: pointwise - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function @@ -7669,11 +7917,11 @@ - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: CompositeExplicitAutograd: bitwise_and - tags: pointwise + tags: [core, pointwise] - func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: @@ -7719,10 +7967,11 @@ structured: True structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: bitwise_or_out + MPS: bitwise_or_out_mps tags: pointwise - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: function @@ -7731,11 +7980,11 @@ tags: pointwise - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function - tags: pointwise + tags: [core, pointwise] - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: @@ -7781,10 +8030,11 @@ structured: True structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: bitwise_xor_out + MPS: bitwise_xor_out_mps tags: pointwise - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function @@ -7793,11 +8043,11 @@ tags: pointwise - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function - tags: pointwise + tags: [core, pointwise] - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: @@ -8065,10 +8315,11 @@ device_check: NoCheck # TensorIterator tags: nondeterministic_seeded variants: method dispatch: CPU, CUDA: random_ + MPS: random_mps_ Meta: random_meta_ autogen: random, random.out - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -8162,11 +8413,11 @@ - func: trace(Tensor self) -> Tensor variants: method, function dispatch: CPU: trace_cpu CUDA: trace_cuda - MPS: trace_mps_out + MPS: trace_mps autogen: trace.out - func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor variants: function device_check: NoCheck @@ -8602,10 +8853,19 @@ CPU: nonzero_cpu CUDA: nonzero_cuda MPS: nonzero_mps tags: [dynamic_output_shape, core] +- func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: nonzero_static_out_cpu + +- func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + CPU: nonzero_static_cpu + - func: nonzero_numpy(Tensor self) -> Tensor[] variants: method, function - func: argwhere(Tensor self) -> Tensor variants: method, function @@ -8708,12 +8968,14 @@ variants: function dispatch: CPU, CUDA: linalg_solve_triangular MPS: linalg_solve_triangular_mps -- func: linalg_vander(Tensor x, *, int? N=None) -> Tensor +- func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor python_module: linalg + dispatch: + CompositeImplicitAutograd: linalg_vander_symint - func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) variants: method, function @@ -8915,10 +9177,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erfinv_out + MPS: erfinv_out_mps SparseCPU, SparseCUDA: erfinv_sparse_out SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out tags: pointwise - func: i0(Tensor self) -> Tensor @@ -8997,11 +9260,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan2_out - MPS: atan2_mps_out + MPS: atan2_out_mps tags: pointwise - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured_delegate: atan2.out @@ -9028,18 +9291,20 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: lerp_Scalar + MPS: lerp_Scalar_mps tags: pointwise - func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: lerp_Tensor + MPS: lerp_Tensor_mps tags: pointwise - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -9052,50 +9317,50 @@ structured_delegate: lerp.Tensor_out tags: pointwise - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: histogram_histc_cpu_out + CPU, MPS: histogram_histc_out CUDA: _histc_out_cuda - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor variants: method, function dispatch: - CPU: histogram_histc_cpu + CPU, MPS: histogram_histc CUDA: _histc_cuda - func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) dispatch: - CPU: histogram_out_cpu + CPU, MPS: histogram_out - func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) variants: method, function dispatch: - CPU: histogram_cpu + CPU, MPS: histogram - func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) dispatch: - CPU: histogram_out_cpu + CPU, MPS: histogram_out - func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) variants: method, function dispatch: - CPU: histogram_cpu + CPU, MPS: histogram - func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[] dispatch: - CPU: histogramdd_bin_edges_cpu + CPU, MPS: histogramdd_bin_edges autogen: _histogramdd_bin_edges.out - func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor dispatch: - CPU: histogramdd_cpu + CPU, MPS: _histogramdd autogen: _histogramdd_from_bin_cts.out - func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor dispatch: - CPU: histogramdd_cpu + CPU, MPS: _histogramdd autogen: _histogramdd_from_bin_tensors.out - func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) - func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) @@ -9111,11 +9376,11 @@ - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: CompositeExplicitAutograd: fmod - tags: pointwise + tags: [core, pointwise] - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: @@ -9146,10 +9411,11 @@ - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: hypot_out + MPS: hypot_out_mps tags: pointwise - func: hypot(Tensor self, Tensor other) -> Tensor structured_delegate: hypot.out variants: method, function @@ -9218,11 +9484,11 @@ - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor variants: method, function dispatch: CompositeExplicitAutograd: remainder - tags: pointwise + tags: [core, pointwise] - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method dispatch: CompositeExplicitAutograd: remainder_ @@ -9263,16 +9529,15 @@ dispatch: CPU, CUDA: min MPS: min_mps QuantizedCPU: min_quantized_cpu -# Not to be confused with binary op `min.out`. Commented because of failed CI -# FIXME: enable this -#- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) -# device_check: NoCheck # TensorIterator -# dispatch: -# CompositeExplicitAutograd: min_unary_out +- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: min_unary_out + QuantizedCPU: min_quantized_unary_out - func: fmin(Tensor self, Tensor other) -> Tensor structured_delegate: fmin.out device_check: NoCheck # TensorIterator variants: method, function @@ -9281,11 +9546,11 @@ - func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: fmin_out + CPU, CUDA, MPS: fmin_out tags: pointwise - func: max(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -9303,11 +9568,11 @@ - func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: fmax_out + CPU, CUDA, MPS: fmax_out tags: pointwise - func: maximum(Tensor self, Tensor other) -> Tensor structured_delegate: maximum.out device_check: NoCheck # TensorIterator @@ -9400,10 +9665,11 @@ - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator variants: method, function dispatch: CompositeExplicitAutograd: sort + tags: core - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) structured_delegate: sort.values_stable variants: method, function dispatch: @@ -9436,18 +9702,18 @@ autogen: argsort.stable_out - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor variants: method, function -- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) +- func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) structured: True dispatch: CPU: topk_out_cpu CUDA: topk_out_cuda MPS: topk_out_mps -- func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) +- func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function structured_delegate: topk.values dispatch: QuantizedCPU: topk_quantized_cpu tags: core @@ -9468,10 +9734,11 @@ device_check: NoCheck # TensorIterator structured_delegate: any.all_out variants: method, function dispatch: SparseCPU, SparseCUDA: any_sparse + tags: core - func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck structured: True dispatch: @@ -9481,10 +9748,11 @@ - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True dispatch: CPU, CUDA: renorm_out + MPS: renorm_out_mps - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor device_check: NoCheck # TensorIterator variants: method, function structured_delegate: renorm.out @@ -9535,10 +9803,11 @@ - func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True dispatch: CPU, CUDA: pow_Scalar_out + MPS: pow_Scalar_out_mps tags: pointwise - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: pow.Scalar_out @@ -9609,10 +9878,11 @@ dispatch: CPU, CUDA: normal_ MPS: normal_mps_ Meta: normal_meta_ SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: normal_nested_ autogen: normal.out # Only used by the functionalization pass. # Normally, the codegen would be able to generate a normal() NativeFunction, # but we can't due to overload ambiguity with normal.Tensor_float. 
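A minimal sketch of how two of the schema changes in the hunks above surface at the Python level, assuming PyTorch 2.1 (the libtorch release torch-rb 0.14.0 builds against); this snippet is illustrative only and is not part of the diff:

import torch

x = torch.randn(8, 4)

# var.correction now declares `Scalar? correction` instead of `int?`, so a
# fractional correction is accepted where the old schema only allowed integers.
v = torch.var(x, dim=0, correction=0.5)

# The sparse_coo_tensor overloads gained an optional `is_coalesced` hint.
indices = torch.tensor([[0, 1], [0, 1]])
values = torch.tensor([1.0, 2.0])
s = torch.sparse_coo_tensor(indices, values, (2, 2), is_coalesced=True)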
@@ -9718,160 +9988,159 @@ dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ autogen: _foreach_add.Scalar_out -- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalar_kernel_slow - CUDA: foreach_tensor_sub_scalar_kernel_cuda + CPU: foreach_tensor_add_list_kernel_slow + CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalar_kernel_slow_ - CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - autogen: _foreach_sub.Scalar_out + CPU: foreach_tensor_add_list_kernel_slow_ + CUDA: foreach_tensor_add_list_kernel_cuda_ + autogen: _foreach_add.List_out -- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalar_kernel_slow - CUDA: foreach_tensor_mul_scalar_kernel_cuda + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda -- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalar_kernel_slow_ - CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - autogen: _foreach_mul.Scalar_out + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + autogen: _foreach_add.ScalarList_out -- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalar_kernel_slow - CUDA: foreach_tensor_div_scalar_kernel_cuda + CPU: foreach_tensor_sub_scalar_kernel_slow + CUDA: foreach_tensor_sub_scalar_kernel_cuda -- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalar_kernel_slow_ - CUDA: foreach_tensor_div_scalar_kernel_cuda_ - autogen: _foreach_div.Scalar_out + CPU: foreach_tensor_sub_scalar_kernel_slow_ + CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + autogen: _foreach_sub.Scalar_out -- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow - CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + 
CPU: foreach_tensor_sub_list_kernel_slow + CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ - CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ - autogen: _foreach_clamp_min.Scalar_out + CPU: foreach_tensor_sub_list_kernel_slow_ + CUDA: foreach_tensor_sub_list_kernel_cuda_ + autogen: _foreach_sub.List_out -- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow - CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda -- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ - CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ - autogen: _foreach_clamp_max.Scalar_out + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + autogen: _foreach_sub.ScalarList_out -# foreach_minimum/maximum dispatches to clamp_max/min -- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow - CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + CPU: foreach_tensor_mul_scalar_kernel_slow + CUDA: foreach_tensor_mul_scalar_kernel_cuda -- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ - CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ - autogen: _foreach_maximum.Scalar_out + CPU: foreach_tensor_mul_scalar_kernel_slow_ + CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + autogen: _foreach_mul.Scalar_out -- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] +- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow - CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + CPU: foreach_tensor_mul_list_kernel_slow + CUDA: foreach_tensor_mul_list_kernel_cuda -- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () +- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: 
foreach_tensor_clamp_max_scalar_kernel_slow_ - CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ - autogen: _foreach_minimum.Scalar_out + CPU: foreach_tensor_mul_list_kernel_slow_ + CUDA: foreach_tensor_mul_list_kernel_cuda_ + autogen: _foreach_mul.List_out -- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] +- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_list_kernel_slow - CUDA: foreach_tensor_add_list_kernel_cuda + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () +- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_list_kernel_slow_ - CUDA: foreach_tensor_add_list_kernel_cuda_ - autogen: _foreach_add.List_out + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + autogen: _foreach_mul.ScalarList_out -- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] +- func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_list_kernel_slow - CUDA: foreach_tensor_sub_list_kernel_cuda + CPU: foreach_tensor_mul_tensor_kernel_slow + CUDA: foreach_tensor_mul_tensor_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () +- func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_list_kernel_slow_ - CUDA: foreach_tensor_sub_list_kernel_cuda_ - autogen: _foreach_sub.List_out + CPU: foreach_tensor_mul_tensor_kernel_slow_ + CUDA: foreach_tensor_mul_tensor_kernel_cuda_ + autogen: _foreach_mul.Tensor_out -- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[] +- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_list_kernel_slow - CUDA: foreach_tensor_mul_list_kernel_cuda + CPU: foreach_tensor_div_scalar_kernel_slow + CUDA: foreach_tensor_div_scalar_kernel_cuda -- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () +- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_list_kernel_slow_ - CUDA: foreach_tensor_mul_list_kernel_cuda_ - autogen: _foreach_mul.List_out + CPU: foreach_tensor_div_scalar_kernel_slow_ + CUDA: foreach_tensor_div_scalar_kernel_cuda_ + autogen: _foreach_div.Scalar_out - func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -9884,25 +10153,40 @@ dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: 
foreach_tensor_div_list_kernel_cuda_ autogen: _foreach_div.List_out -- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[] +- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow - CUDA: foreach_tensor_clamp_min_list_kernel_cuda + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda -- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> () +- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow_ - CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ - autogen: _foreach_clamp_min.List_out + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + autogen: _foreach_div.ScalarList_out +- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalar_kernel_slow + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + +- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + autogen: _foreach_clamp_max.Scalar_out + - func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_clamp_max_list_kernel_slow @@ -9914,147 +10198,147 @@ dispatch: CPU: foreach_tensor_clamp_max_list_kernel_slow_ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ autogen: _foreach_clamp_max.List_out -# foreach_minimum/maximum dispatches to clamp_max/min -- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[] +- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow - CUDA: foreach_tensor_clamp_min_list_kernel_cuda + CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda -- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> () +- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow_ - CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ - autogen: _foreach_maximum.List_out + CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + autogen: _foreach_clamp_max.ScalarList_out -- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[] +- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path 
when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_list_kernel_slow - CUDA: foreach_tensor_clamp_max_list_kernel_cuda + CPU: foreach_tensor_clamp_min_scalar_kernel_slow + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda -- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> () +- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_list_kernel_slow_ - CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ - autogen: _foreach_minimum.List_out + CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + autogen: _foreach_clamp_min.Scalar_out - -- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow - CUDA: foreach_tensor_add_scalarlist_kernel_cuda + CPU: foreach_tensor_clamp_min_list_kernel_slow + CUDA: foreach_tensor_clamp_min_list_kernel_cuda -- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () +- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow_ - CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - autogen: _foreach_add.ScalarList_out + CPU: foreach_tensor_clamp_min_list_kernel_slow_ + CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + autogen: _foreach_clamp_min.List_out -- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow - CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda -- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () +- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow_ - CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - autogen: _foreach_sub.ScalarList_out + CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + autogen: _foreach_clamp_min.ScalarList_out -- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow - CUDA: foreach_tensor_div_scalarlist_kernel_cuda + CPU: foreach_tensor_clamp_min_scalar_kernel_slow + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda -- func: _foreach_div_.ScalarList(Tensor(a!)[] self, 
Scalar[] scalars) -> () +- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow_ - CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - autogen: _foreach_div.ScalarList_out + CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ + CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + autogen: _foreach_maximum.Scalar_out -- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow - CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + CPU: foreach_tensor_clamp_min_list_kernel_slow + CUDA: foreach_tensor_clamp_min_list_kernel_cuda -- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () +- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow_ - CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - autogen: _foreach_mul.ScalarList_out + CPU: foreach_tensor_clamp_min_list_kernel_slow_ + CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + autogen: _foreach_maximum.List_out -- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +# foreach_minimum/maximum dispatches to clamp_max/min +- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda -- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () +- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ - autogen: _foreach_clamp_min.ScalarList_out + autogen: _foreach_maximum.ScalarList_out -- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow - CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda + CPU: foreach_tensor_clamp_max_scalar_kernel_slow + CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda -- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () +- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ - CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ - autogen: _foreach_clamp_max.ScalarList_out + CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ + CUDA: 
foreach_tensor_clamp_max_scalar_kernel_cuda_ + autogen: _foreach_minimum.Scalar_out -# foreach_minimum/maximum dispatches to clamp_max/min -- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] +- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow - CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + CPU: foreach_tensor_clamp_max_list_kernel_slow + CUDA: foreach_tensor_clamp_max_list_kernel_cuda -- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () +- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ - CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ - autogen: _foreach_maximum.ScalarList_out + CPU: foreach_tensor_clamp_max_list_kernel_slow_ + CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + autogen: _foreach_minimum.List_out - func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -10067,48 +10351,100 @@ dispatch: CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ autogen: _foreach_minimum.ScalarList_out -- func: _foreach_exp(Tensor[] self) -> Tensor[] +- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_exp_slow - CUDA: foreach_tensor_exp_cuda + CPU: foreach_tensor_addcdiv_scalar_slow + CUDA: foreach_tensor_addcdiv_scalar_cuda -- func: _foreach_zero_(Tensor(a!)[] self) -> () +- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_zero_slow_ - CUDA: foreach_tensor_zero_cuda_ - autogen: _foreach_zero, _foreach_zero.out + CPU: foreach_tensor_addcdiv_scalarlist_slow + CUDA: foreach_tensor_addcdiv_scalarlist_cuda -- func: _foreach_exp_(Tensor(a!)[] self) -> () +- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_exp_slow_ - CUDA: foreach_tensor_exp_cuda_ - autogen: _foreach_exp.out + CPU: foreach_tensor_addcdiv_tensor_slow + CUDA: foreach_tensor_addcdiv_tensor_cuda -- func: _foreach_sqrt(Tensor[] self) -> Tensor[] +- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sqrt_slow - CUDA: foreach_tensor_sqrt_cuda + CPU: foreach_tensor_addcdiv_scalar_slow_ + CUDA: foreach_tensor_addcdiv_scalar_cuda_ + autogen: _foreach_addcdiv.Scalar_out -- func: _foreach_sqrt_(Tensor(a!)[] self) -> () +- func: 
_foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sqrt_slow_ - CUDA: foreach_tensor_sqrt_cuda_ - autogen: _foreach_sqrt.out + CPU: foreach_tensor_addcdiv_scalarlist_slow_ + CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ + autogen: _foreach_addcdiv.ScalarList_out +- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcdiv_tensor_slow_ + CUDA: foreach_tensor_addcdiv_tensor_cuda_ + autogen: _foreach_addcdiv.Tensor_out + +- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalar_slow + CUDA: foreach_tensor_addcmul_scalar_cuda + +- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalarlist_slow + CUDA: foreach_tensor_addcmul_scalarlist_cuda + +- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_tensor_slow + CUDA: foreach_tensor_addcmul_tensor_cuda + +- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalar_slow_ + CUDA: foreach_tensor_addcmul_scalar_cuda_ + autogen: _foreach_addcmul.Scalar_out + +- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_scalarlist_slow_ + CUDA: foreach_tensor_addcmul_scalarlist_cuda_ + autogen: _foreach_addcmul.ScalarList_out + +- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_addcmul_tensor_slow_ + CUDA: foreach_tensor_addcmul_tensor_cuda_ + autogen: _foreach_addcmul.Tensor_out + - func: _foreach_abs(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_abs_slow @@ -10240,10 +10576,25 @@ dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ autogen: _foreach_erfc.out +- func: _foreach_exp(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_exp_slow + 
CUDA: foreach_tensor_exp_cuda + +- func: _foreach_exp_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_exp_slow_ + CUDA: foreach_tensor_exp_cuda_ + autogen: _foreach_exp.out + - func: _foreach_expm1(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_expm1_slow @@ -10270,10 +10621,72 @@ dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ autogen: _foreach_floor.out +- func: _foreach_frac(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_frac_slow + CUDA: foreach_tensor_frac_cuda + +- func: _foreach_frac_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_frac_slow_ + CUDA: foreach_tensor_frac_cuda_ + autogen: _foreach_frac.out + +- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_ternary_lerp_slow + CUDA: foreach_tensor_lerp_ternary_cuda + autogen: _foreach_lerp.List_out + +- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_ternary_lerp_slow_ + CUDA: foreach_tensor_lerp_ternary_cuda_ + autogen: _foreach_lerp.List_out + +- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lerp_list_kernel_slow + CUDA: foreach_tensor_lerp_list_cuda + autogen: _foreach_lerp.Scalar_out + +- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lerp_list_kernel_slow_ + CUDA: foreach_tensor_lerp_list_cuda_ + autogen: _foreach_lerp.Scalar_out + +- func: _foreach_lgamma(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lgamma_slow + CUDA: foreach_tensor_lgamma_cuda + +- func: _foreach_lgamma_(Tensor(a!)[] self) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_lgamma_slow_ + CUDA: foreach_tensor_lgamma_cuda_ + autogen: _foreach_lgamma.out + - func: _foreach_log(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_log_slow @@ -10345,69 +10758,84 @@ dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ autogen: _foreach_neg.out -- func: _foreach_tan(Tensor[] self) -> Tensor[] +- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) 
-> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tan_slow - CUDA: foreach_tensor_tan_cuda + CPU: foreach_tensor_norm_slow + CUDA: foreach_tensor_norm_cuda + autogen: _foreach_norm.Scalar_out -- func: _foreach_tan_(Tensor(a!)[] self) -> () +- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tan_slow_ - CUDA: foreach_tensor_tan_cuda_ - autogen: _foreach_tan.out + CPU: foreach_tensor_pow_list_kernel_slow + CUDA: foreach_tensor_pow_list_kernel_cuda -- func: _foreach_tanh(Tensor[] self) -> Tensor[] +- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tanh_slow - CUDA: foreach_tensor_tanh_cuda + CPU: foreach_tensor_pow_scalar_kernel_slow + CUDA: foreach_tensor_pow_scalar_kernel_cuda -- func: _foreach_tanh_(Tensor(a!)[] self) -> () +- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tanh_slow_ - CUDA: foreach_tensor_tanh_cuda_ - autogen: _foreach_tanh.out + CPU: foreach_tensor_pow_scalarlist_kernel_slow + CUDA: foreach_tensor_pow_scalarlist_kernel_cuda -- func: _foreach_sin(Tensor[] self) -> Tensor[] +- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sin_slow - CUDA: foreach_tensor_sin_cuda + CPU: foreach_scalar_pow_list_kernel_slow + CUDA: foreach_scalar_pow_list_kernel_cuda -- func: _foreach_sin_(Tensor(a!)[] self) -> () - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices +- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> () + device_check: NoCheck variants: function dispatch: - CPU: foreach_tensor_sin_slow_ - CUDA: foreach_tensor_sin_cuda_ - autogen: _foreach_sin.out + CPU: foreach_tensor_pow_list_kernel_slow_ + CUDA: foreach_tensor_pow_list_kernel_cuda_ + autogen: _foreach_pow.List_out -- func: _foreach_sinh(Tensor[] self) -> Tensor[] +- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: foreach_tensor_pow_scalar_kernel_slow_ + CUDA: foreach_tensor_pow_scalar_kernel_cuda_ + autogen: _foreach_pow.Scalar_out + +- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> () + device_check: NoCheck + variants: function + dispatch: + CPU: foreach_tensor_pow_scalarlist_kernel_slow_ + CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_ + autogen: _foreach_pow.ScalarList_out + +- func: _foreach_reciprocal(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sinh_slow - CUDA: foreach_tensor_sinh_cuda + CPU: foreach_tensor_reciprocal_slow + CUDA: foreach_tensor_reciprocal_cuda -- func: _foreach_sinh_(Tensor(a!)[] self) -> () +- func: _foreach_reciprocal_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels 
fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sinh_slow_ - CUDA: foreach_tensor_sinh_cuda_ - autogen: _foreach_sinh.out + CPU: foreach_tensor_reciprocal_slow_ + CUDA: foreach_tensor_reciprocal_cuda_ + autogen: _foreach_reciprocal.out - func: _foreach_round(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -10420,55 +10848,10 @@ dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ autogen: _foreach_round.out -- func: _foreach_lgamma(Tensor[] self) -> Tensor[] - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices - variants: function - dispatch: - CPU: foreach_tensor_lgamma_slow - CUDA: foreach_tensor_lgamma_cuda - -- func: _foreach_lgamma_(Tensor(a!)[] self) -> () - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices - variants: function - dispatch: - CPU: foreach_tensor_lgamma_slow_ - CUDA: foreach_tensor_lgamma_cuda_ - autogen: _foreach_lgamma.out - -- func: _foreach_frac(Tensor[] self) -> Tensor[] - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices - variants: function - dispatch: - CPU: foreach_tensor_frac_slow - CUDA: foreach_tensor_frac_cuda - -- func: _foreach_frac_(Tensor(a!)[] self) -> () - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices - variants: function - dispatch: - CPU: foreach_tensor_frac_slow_ - CUDA: foreach_tensor_frac_cuda_ - autogen: _foreach_frac.out - -- func: _foreach_reciprocal(Tensor[] self) -> Tensor[] - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices - variants: function - dispatch: - CPU: foreach_tensor_reciprocal_slow - CUDA: foreach_tensor_reciprocal_cuda - -- func: _foreach_reciprocal_(Tensor(a!)[] self) -> () - device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices - variants: function - dispatch: - CPU: foreach_tensor_reciprocal_slow_ - CUDA: foreach_tensor_reciprocal_cuda_ - autogen: _foreach_reciprocal.out - - func: _foreach_sigmoid(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sigmoid_slow @@ -10480,155 +10863,131 @@ dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ autogen: _foreach_sigmoid.out -- func: _foreach_trunc(Tensor[] self) -> Tensor[] +- func: _foreach_sign(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_trunc_slow - CUDA: foreach_tensor_trunc_cuda + CPU: foreach_tensor_sign_slow + CUDA: foreach_tensor_sign_cuda -- func: _foreach_trunc_(Tensor(a!)[] self) -> () +- func: _foreach_sign_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_trunc_slow_ - CUDA: foreach_tensor_trunc_cuda_ - autogen: _foreach_trunc.out + CPU: foreach_tensor_sign_slow_ + CUDA: foreach_tensor_sign_cuda_ + autogen: _foreach_sign.out -- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () +- func: _foreach_sin(Tensor[] self) -> 
Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalar_slow_ - CUDA: foreach_tensor_addcdiv_scalar_cuda_ - autogen: _foreach_addcdiv.Scalar_out + CPU: foreach_tensor_sin_slow + CUDA: foreach_tensor_sin_cuda -- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () +- func: _foreach_sin_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalar_slow_ - CUDA: foreach_tensor_addcmul_scalar_cuda_ - autogen: _foreach_addcmul.Scalar_out + CPU: foreach_tensor_sin_slow_ + CUDA: foreach_tensor_sin_cuda_ + autogen: _foreach_sin.out -- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () +- func: _foreach_sinh(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalarlist_slow_ - CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ - autogen: _foreach_addcdiv.ScalarList_out + CPU: foreach_tensor_sinh_slow + CUDA: foreach_tensor_sinh_cuda -- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () +- func: _foreach_sinh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_tensor_slow_ - CUDA: foreach_tensor_addcdiv_tensor_cuda_ - autogen: _foreach_addcdiv.Tensor_out + CPU: foreach_tensor_sinh_slow_ + CUDA: foreach_tensor_sinh_cuda_ + autogen: _foreach_sinh.out -- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () +- func: _foreach_sqrt(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalarlist_slow_ - CUDA: foreach_tensor_addcmul_scalarlist_cuda_ - autogen: _foreach_addcmul.ScalarList_out + CPU: foreach_tensor_sqrt_slow + CUDA: foreach_tensor_sqrt_cuda -- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () +- func: _foreach_sqrt_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_tensor_slow_ - CUDA: foreach_tensor_addcmul_tensor_cuda_ - autogen: _foreach_addcmul.Tensor_out + CPU: foreach_tensor_sqrt_slow_ + CUDA: foreach_tensor_sqrt_cuda_ + autogen: _foreach_sqrt.out -- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] +- func: _foreach_tan(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalar_slow - CUDA: foreach_tensor_addcdiv_scalar_cuda + CPU: foreach_tensor_tan_slow + CUDA: foreach_tensor_tan_cuda -- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] +- func: _foreach_tan_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices 
variants: function dispatch: - CPU: foreach_tensor_addcmul_scalar_slow - CUDA: foreach_tensor_addcmul_scalar_cuda + CPU: foreach_tensor_tan_slow_ + CUDA: foreach_tensor_tan_cuda_ + autogen: _foreach_tan.out -- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] +- func: _foreach_tanh(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalarlist_slow - CUDA: foreach_tensor_addcdiv_scalarlist_cuda + CPU: foreach_tensor_tanh_slow + CUDA: foreach_tensor_tanh_cuda -- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] +- func: _foreach_tanh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_tensor_slow - CUDA: foreach_tensor_addcdiv_tensor_cuda + CPU: foreach_tensor_tanh_slow_ + CUDA: foreach_tensor_tanh_cuda_ + autogen: _foreach_tanh.out -- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] +- func: _foreach_trunc(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalarlist_slow - CUDA: foreach_tensor_addcmul_scalarlist_cuda + CPU: foreach_tensor_trunc_slow + CUDA: foreach_tensor_trunc_cuda -- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] +- func: _foreach_trunc_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_tensor_slow - CUDA: foreach_tensor_addcmul_tensor_cuda + CPU: foreach_tensor_trunc_slow_ + CUDA: foreach_tensor_trunc_cuda_ + autogen: _foreach_trunc.out -- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[] +- func: _foreach_zero_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_norm_slow - CUDA: foreach_tensor_norm_cuda - autogen: _foreach_norm.Scalar_out + CPU: foreach_tensor_zero_slow_ + CUDA: foreach_tensor_zero_cuda_ + autogen: _foreach_zero, _foreach_zero.out -- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[] - device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices +- func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_ternary_lerp_slow - CUDA: foreach_tensor_lerp_ternary_cuda - autogen: _foreach_lerp.List_out + CPU: foreach_tensor_copy_list_kernel_slow_ + CUDA: foreach_tensor_copy_list_kernel_cuda_ + autogen: _foreach_copy, _foreach_copy.out -- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> () - device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices - variants: function - dispatch: - CPU: foreach_tensor_ternary_lerp_slow_ - CUDA: foreach_tensor_lerp_ternary_cuda_ - autogen: _foreach_lerp.List_out - -- func: 
_foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[] - device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices - variants: function - dispatch: - CPU: foreach_tensor_lerp_list_kernel_slow - CUDA: foreach_tensor_lerp_list_cuda - autogen: _foreach_lerp.Scalar_out - -- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> () - device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices - variants: function - dispatch: - CPU: foreach_tensor_lerp_list_kernel_slow_ - CUDA: foreach_tensor_lerp_list_cuda_ - autogen: _foreach_lerp.Scalar_out - - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda @@ -10655,12 +11014,16 @@ - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda - autogen: searchsorted.Scalar_out +- func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: searchsorted_out_cpu + CUDA: searchsorted_out_cuda + - func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor structured_delegate: _convert_indices_from_coo_to_csr.out - func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!) structured: True @@ -10979,10 +11342,11 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU, CUDA: hardsigmoid_out + MPS: hardsigmoid_out_mps QuantizedCPU: hardsigmoid_out_quantized_cpu - func: hardsigmoid(Tensor self) -> Tensor structured_delegate: hardsigmoid.out device_check: NoCheck # TensorIterator @@ -10999,10 +11363,11 @@ structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: hardsigmoid_backward_out + MPS: hardsigmoid_backward_out_mps - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor structured_delegate: hardsigmoid_backward.grad_input python_module: nn @@ -11117,29 +11482,33 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU: log_sigmoid_forward_out_cpu CUDA: log_sigmoid_forward_out_cuda + MPS: log_sigmoid_forward_out_mps - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU: log_sigmoid_forward_cpu CUDA: log_sigmoid_forward_cuda + MPS: log_sigmoid_forward_mps - func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: log_sigmoid_backward_cpu_out CUDA: log_sigmoid_backward_cuda_out + MPS: log_sigmoid_backward_mps_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor python_module: nn dispatch: CPU: log_sigmoid_backward_cpu CUDA: log_sigmoid_backward_cuda + MPS: log_sigmoid_backward_mps - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn tags: nondeterministic_seeded dispatch: @@ -11277,10 +11646,11 @@ dispatch: CPU: adaptive_avg_pool3d_cpu CUDA: adaptive_avg_pool3d_cuda QuantizedCPU: adaptive_avg_pool3d_quantized_cpu autogen: _adaptive_avg_pool3d.out + tags: core - func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: adaptive_avg_pool3d_backward_out_cpu @@ -11392,10 +11762,11 @@ python_module: nn structured_delegate: avg_pool3d.out dispatch: MkldnnCPU: mkldnn_avg_pool3d QuantizedCPU: avg_pool3d_quantized_cpu + tags: core - func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn structured: True dispatch: @@ -11515,29 +11886,29 @@ python_module: nn dispatch: CPU: max_pool3d_with_indices_backward_cpu CUDA: max_pool3d_with_indices_backward_cuda -- func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +- func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: max_unpooling2d_forward_out_cpu CUDA: max_unpooling2d_forward_out_cuda -- func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor +- func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor python_module: nn dispatch: CPU: max_unpooling2d_forward_cpu CUDA: max_unpooling2d_forward_cuda -- func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) +- func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: max_unpooling3d_forward_out_cpu CUDA: max_unpooling3d_forward_out_cuda -- func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor +- func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor python_module: nn dispatch: CPU: max_unpooling3d_forward_cpu CUDA: max_unpooling3d_forward_cuda @@ -11551,10 +11922,11 @@ MPS: reflection_pad1d_out_mps - func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor python_module: nn structured_delegate: reflection_pad1d.out + tags: core - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn structured: True dispatch: @@ -11605,10 +11977,11 @@ MPS: reflection_pad3d_out_mps - func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor python_module: nn structured_delegate: reflection_pad3d.out + tags: core - func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn structured: True dispatch: @@ -12067,10 +12440,11 @@ python_module: nn structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logit_backward_out + MPS: logit_backward_out_mps tags: pointwise - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor python_module: nn structured_delegate: logit_backward.grad_input @@ -12713,161 +13087,233 @@ # # See fft_fft as an example. 
# torch.fft.fft # NOTE: NOT an alias for torch.fft, which has different semantics -- func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor +- func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_fft_symint -- func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_fft_symint_out -- func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor +- func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft_symint -- func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft_symint_out -- func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor +- func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft_symint -- func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft_symint_out -- func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor +- func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft_symint -- func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft_symint_out -- func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor +- func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft_symint -- func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft_symint_out -- func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor +- func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft_symint -- func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) 
python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft_symint_out -- func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +- func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_fft2_symint -- func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_fft2_symint_out -- func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +- func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft2_symint -- func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ifft2_symint_out -- func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +- func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft2_symint -- func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_rfft2_symint_out -- func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +- func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft2_symint -- func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_irfft2_symint_out -- func: fft_hfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +- func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft2_symint -- func: fft_hfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_hfft2_symint_out -- func: fft_ihfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +- func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? 
norm=None) -> Tensor use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft2_symint -- func: fft_ihfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfft2_symint_out -- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor +- func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_fftn_symint -- func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_fftn_symint_out -- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor +- func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ifftn_symint -- func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ifftn_symint_out -- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor +- func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_rfftn_symint -- func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_rfftn_symint_out -- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor +- func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_irfftn_symint -- func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_irfftn_symint_out -- func: fft_hfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor +- func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_hfftn_symint -- func: fft_hfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? 
dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_hfftn_symint_out -- func: fft_ihfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor +- func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfftn_symint -- func: fft_ihfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +- func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: fft variants: function + dispatch: + CompositeImplicitAutograd: fft_ihfftn_symint_out - func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor python_module: fft variants: function dispatch: @@ -13208,10 +13654,11 @@ - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) python_module: linalg structured: True dispatch: CPU, CUDA: linalg_vector_norm_out + MPS: linalg_vector_norm_out_mps - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor python_module: linalg - func: linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -13786,10 +14233,11 @@ - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor dispatch: NestedTensorCPU: NestedTensor_softmax_dropout NestedTensorCUDA: NestedTensor_softmax_dropout_cuda + tags: nondeterministic_seeded # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is. - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor variants: function dispatch: @@ -13801,81 +14249,92 @@ dispatch: CPU, NestedTensorCPU: native_multi_head_attention_cpu CUDA, NestedTensorCUDA: native_multi_head_attention_cuda autogen: _native_multi_head_attention.out -- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> Tensor +- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor python_module: nn variants: function autogen: scaled_dot_product_attention.out + tags: nondeterministic_seeded -# TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS WITH THIS OP BUILTIN -- func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? 
attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor) - python_module: nn - variants: function - autogen: _scaled_dot_product_attention.out - # This aten function is kept so that we can test the choice function from Python -- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> int +- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int dispatch: Meta: _fused_sdp_choice_meta CPU, NestedTensorCPU: _fused_sdp_choice_cpp CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda + tags: nondeterministic_seeded -- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None) -> (Tensor, Tensor) +- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor) variants: function + tags: nondeterministic_seeded -- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, int philox_seed, int philox_offset, Tensor debug_attn_mask) +- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) dispatch: + CPU: _scaled_dot_product_flash_attention_cpu CUDA: _scaled_dot_product_flash_attention_cuda NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda + tags: nondeterministic_seeded -- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value) +- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value) + device_check: NoCheck variants: function dispatch: + CPU: _scaled_dot_product_flash_attention_backward_cpu CUDA: _scaled_dot_product_flash_attention_backward_cuda -- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor) +- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? 
scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset) dispatch: CUDA: _scaled_dot_product_efficient_attention_cuda NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda + tags: nondeterministic_seeded -- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor) +- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor) + device_check: NoCheck dispatch: CUDA: _scaled_dot_product_efficient_attention_backward_cuda + tags: nondeterministic_seeded -- func: _chunk_grad_outputs_efficient_attention(Tensor query, Tensor key, Tensor value, bool is_causal=False) -> bool - dispatch: - CUDA: _chunk_grad_outputs_efficient_attention - -- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask) -> (Tensor output, Tensor softmax_logsumexp, int philox_seed, int philox_offset, Tensor debug_attn_mask) +- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) variants: function dispatch: CUDA: _flash_attention_forward + tags: nondeterministic_seeded -- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor, Tensor, Tensor) +- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor) + device_check: NoCheck variants: function dispatch: CUDA: _flash_attention_backward # Returns ouput, logsumexp if compute_logsumexp -- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor) +- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset) variants: function dispatch: CUDA: _efficient_attention_forward + tags: nondeterministic_seeded -- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor) +- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? 
bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor) + device_check: NoCheck variants: function dispatch: CUDA: _efficient_attention_backward - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor variants: function dispatch: CUDA: triton_scaled_dot_attention + tags: nondeterministic_seeded autogen: _triton_scaled_dot_attention.out +- func: _fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!) + variants: function + dispatch: + CUDA: _fill_mem_eff_dropout_mask_ + tags: nondeterministic_seeded + - func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor variants: function dispatch: CUDA: triton_multi_head_attention autogen: _triton_multi_head_attention.out @@ -13893,22 +14352,10 @@ structured_inherits: TensorIteratorBase structured: True variants: function tags: pointwise -- func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor) - variants: function - dispatch: - CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_decoder_only_layer_forward - autogen: _transformer_decoder_only_layer_fwd.out - -- func: _native_decoder_only_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor, Tensor, Tensor) - variants: function - dispatch: - CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_decoder_only_multi_head_attention - autogen: _native_decoder_only_multi_head_attention.out - - func: special_bessel_j0(Tensor self) -> Tensor python_module: special structured_delegate: special_bessel_j0.out variants: function tags: pointwise @@ -14601,11 +15048,33 @@ variants: function dispatch: CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam, _fused_adam.out +- func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now), + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CUDA: _fused_adam_kernel_cuda_ + autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out + - func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw, _fused_adamw.out + +- func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now), + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CUDA: _fused_adamw_kernel_cuda_ + autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out + +# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. +- func: _propagate_xla_data(Tensor input, Tensor output) -> () + variants: function
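
For context on how the schema changes in this hunk surface at the operator level, here is a minimal Python sketch against upstream PyTorch, from whose native_functions.yaml torch-rb generates its Ruby bindings. It is an illustrative sketch only: the tensor shapes and values are invented, and the _foreach_pow / _foreach_lerp_ / _foreach_copy_ bindings, the keyword-only scale= argument of scaled_dot_product_attention, and the out= form of the searchsorted Scalar overload are assumed to be exposed exactly as the func: schemas above declare; none of this is torch-rb's own Ruby API.

import torch
import torch.nn.functional as F

# foreach ops added in this hunk: elementwise pow over a list of tensors
# (_foreach_pow.List), in-place lerp with a scalar weight (_foreach_lerp_.Scalar),
# and an in-place list copy (_foreach_copy_, with autogen'd _foreach_copy variants).
params  = [torch.randn(3), torch.randn(3)]
exps    = [torch.full((3,), 2.0), torch.full((3,), 3.0)]
targets = [torch.zeros(3), torch.zeros(3)]

powed = torch._foreach_pow(params, exps)    # returns a new list of tensors
torch._foreach_lerp_(params, targets, 0.5)  # mutates each params[i] in place
torch._foreach_copy_(params, targets)       # copies targets[i] into params[i]

# scaled_dot_product_attention gains a keyword-only `scale` argument
# (scale=None keeps the default 1/sqrt(head_dim) scaling).
q = torch.randn(2, 4, 8, 16)                # (batch, heads, seq_len, head_dim)
k = torch.randn(2, 4, 8, 16)
v = torch.randn(2, 4, 8, 16)
out = F.scaled_dot_product_attention(q, k, v, is_causal=True, scale=0.25)

# searchsorted.Scalar_out: the Scalar overload now has a dedicated out= kernel.
seq = torch.tensor([1.0, 3.0, 5.0, 7.0])
idx = torch.empty((), dtype=torch.int64)
torch.searchsorted(seq, 4.0, out=idx)       # idx becomes tensor(2)

These calls map one-to-one onto the schemas shown in the diff; on the torch-rb side the same YAML drives code generation, so the corresponding Ruby methods, where exposed, follow the same signatures.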