codegen/native_functions.yaml in torch-rb-0.15.0 vs codegen/native_functions.yaml in torch-rb-0.16.0
- old (lines only in torch-rb 0.15.0)
+ new (lines only in torch-rb 0.16.0)
@@ -132,11 +132,11 @@
dispatch:
CompositeExplicitAutograd: _new_zeros_with_same_feature_meta
autogen: _new_zeros_with_same_feature_meta.out
# This function compares the storage numel of self with that of other, where
-# storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`.
+# storage numel is computed as: `other.storage().nbytes() / other.itemsize()`.
# We create this function for composite compliance purposes. The batching rule
# always returns true because vmapped as_strided does not support accessing
# storage locations not indexable by the input tensor.
# See the note above for more information.
- func: _has_same_storage_numel(Tensor self, Tensor other) -> bool
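
The storage-numel quantity this comment describes can be reproduced from the public API; a minimal sketch (the `_has_same_storage_numel` op itself is internal and not meant to be called directly):

    import torch

    t = torch.arange(12.).as_strided((2, 2), (1, 1))   # view covering only part of its storage
    storage_numel = t.untyped_storage().nbytes() // t.element_size()
    print(t.numel(), storage_numel)                     # 4 vs. 12
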
@@ -173,16 +173,28 @@
- func: _assert_async.msg(Tensor self, str assert_msg) -> ()
dispatch:
CPU: _assert_async_msg_cpu
CUDA: _assert_async_msg_cuda
+- func: _assert_scalar(Scalar self, str assert_msg) -> ()
+ dispatch:
+ CompositeExplicitAutograd: _assert_scalar
+
+- func: _functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _functional_assert_scalar
+
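
These scalar assert ops back the runtime shape checks emitted by the PT2 stack (torch.compile / torch.export); from Python the closest public entry point is torch._check, and the exact lowering to _assert_scalar is an assumption about that stack:

    import torch

    x = torch.randn(4)
    # Raises if the condition is false; in a compiled graph such checks are what
    # the (functionalized) scalar assert ops above represent.
    torch._check(x.shape[0] > 0, lambda: "expected a non-empty tensor")
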
- func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
dispatch:
CPU: _functional_assert_async_msg_cpu
- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
+- func: _print(str s) -> ()
+ dispatch:
+ CompositeExplicitAutograd: _print
+
- func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
dispatch:
CompositeExplicitAutograd: sym_constrain_range
- func: sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> ()
@@ -468,10 +480,11 @@
tags: pointwise
- func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: conj_physical_out
+ MPS: conj_physical_out_mps
SparseCPU, SparseCUDA: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out
tags: pointwise
- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
@@ -562,12 +575,12 @@
Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
ScalarOnly: add (Bool)
dispatch:
SparseCPU: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
- SparseCsrCPU: add_out_sparse_csr_cpu
- SparseCsrCUDA: add_out_sparse_csr_cuda
+ SparseCsrCPU: add_out_sparse_compressed_cpu
+ SparseCsrCUDA: add_out_sparse_compressed_cuda
MkldnnCPU: mkldnn_add_out
MPS: add_out_mps
tags: pointwise
- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@@ -761,11 +774,11 @@
- func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CompositeExplicitAutograd: arange
-# This operator should be named `aragne.start_out` if following the naming convention. However that
+# This operator should be named `arange.start_out` if following the naming convention. However that
# name is already taken. Disabled because of CI job failures.
# FIXME: enable this
#- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
# dispatch:
# CompositeExplicitAutograd: arange_start_out
@@ -1218,10 +1231,17 @@
- func: copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: copysign_out
tags: pointwise
+- func: _lazy_clone(Tensor self) -> Tensor
+ # Like clone, but the copy takes place lazily, only if either the
+ # input or the output are written.
+ variants: function, method
+ dispatch:
+ CompositeExplicitAutograd: _lazy_clone
+
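
A sketch of the copy-on-write behaviour the comment describes, assuming the private torch._lazy_clone binding is exposed in your PyTorch build:

    import torch

    x = torch.ones(3)
    y = torch._lazy_clone(x)   # no data copy yet; storage is shared
    y.add_(1)                  # first write materializes the copy
    print(x, y)                # tensor([1., 1., 1.]) tensor([2., 2., 2.])
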
- func: logical_not(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: logical_not
@@ -1619,10 +1639,11 @@
CompositeExplicitAutograd: complex
- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: complex_out
+ MPS: complex_out_mps
- func: polar(Tensor abs, Tensor angle) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: polar
@@ -1845,12 +1866,15 @@
autogen: cudnn_batch_norm_backward.out
- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
dispatch:
CUDA: cudnn_convolution
- autogen: cudnn_convolution.out
+- func: cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CUDA: cudnn_convolution_out
+
- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
dispatch:
CUDA: cudnn_convolution_transpose
autogen: cudnn_convolution_transpose.out
@@ -2344,11 +2368,11 @@
CUDA: empty_cuda
MPS: empty_mps
Meta: empty_meta_symint
MkldnnCPU: empty_mkldnn
SparseCPU, SparseCUDA, SparseMeta: empty_sparse
- SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed
QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
tags: core
- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
@@ -2450,11 +2474,11 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
- SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
@@ -2952,42 +2976,48 @@
- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
variants: function
dispatch:
CPU: _fft_r2c_mkl
CUDA: _fft_r2c_cufft
+ MPS: _fft_r2c_mps
- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CPU: _fft_r2c_mkl_out
CUDA: _fft_r2c_cufft_out
+ MPS: _fft_r2c_mps_out
# Complex to real inverse FFT
- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
variants: function
dispatch:
CPU: _fft_c2r_mkl
CUDA: _fft_c2r_cufft
+ MPS: _fft_c2r_mps
- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CPU: _fft_c2r_mkl_out
CUDA: _fft_c2r_cufft_out
+ MPS: _fft_c2r_mps_out
# Standard complex to complex FFT (forward or backward)
- func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
variants: function
dispatch:
CPU: _fft_c2c_mkl
CUDA: _fft_c2c_cufft
+ MPS: _fft_c2c_mps
- func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CPU: _fft_c2c_mkl_out
CUDA: _fft_c2c_cufft_out
+ MPS: _fft_c2c_mps_out
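
These are the low-level kernels behind torch.fft; with the new MPS entries the usual calls should also run on Apple-GPU tensors (availability depends on the macOS/PyTorch build):

    import torch

    x = torch.randn(8)
    freq = torch.fft.rfft(x)           # real-to-complex, lowers to _fft_r2c
    back = torch.fft.irfft(freq, n=8)  # complex-to-real, lowers to _fft_c2r
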
- func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
device_check: NoCheck
variants: function
dispatch:
@@ -3300,16 +3330,20 @@
- func: _cslt_compress(Tensor input) -> Tensor
dispatch:
CUDA: _cslt_compress
-- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> Tensor
+- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
dispatch:
CUDA: _cslt_sparse_mm
-- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
+- func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
dispatch:
+ CUDA: _cslt_sparse_mm_search
+
+- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
CUDA: _sparse_semi_structured_linear
- func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
dispatch:
CUDA: _mixed_dtypes_linear
@@ -4056,16 +4090,22 @@
dispatch:
CUDA: _int_mm_out_cuda
- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
dispatch:
+ CPU: _convert_weight_to_int4pack_cpu
CUDA: _convert_weight_to_int4pack_cuda
- func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
dispatch:
+ CPU: _weight_int4pack_mm_cpu
CUDA: _weight_int4pack_mm_cuda
+- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
+ dispatch:
+ CPU: _weight_int8pack_mm_cpu
+
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
python_module: sparse
@@ -4437,11 +4477,10 @@
dispatch:
CPU: pixel_shuffle_cpu
MPS: pixel_shuffle_mps
CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
autogen: pixel_shuffle.out
- tags: core
- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
dispatch:
CPU: pixel_unshuffle_cpu
MPS: pixel_unshuffle_mps
@@ -4808,11 +4847,11 @@
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
CompositeImplicitAutograd: reshape_symint
- CompositeImplicitAutogradNestedTensor: reshape_nested
+ CompositeImplicitAutogradNestedTensor: reshape_nested_symint
- func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: _reshape_copy_symint
@@ -4967,10 +5006,11 @@
- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
structured_delegate: gelu.out
device_check: NoCheck # TensorIterator
python_module: nn
dispatch:
+ QuantizedCPU: gelu_quantized_cpu_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
- func: gelu(Tensor self, *, str approximate='none') -> Tensor
structured_delegate: gelu.out
device_check: NoCheck # TensorIterator
@@ -5354,18 +5394,33 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: slice_backward
autogen: slice_backward.out
+# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
+# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification
+# of PT2 graph input subclass instances that are views. This means:
+# * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
+# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
+# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph
+# input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is
+# easier to implement for a subclass than as_strided()
+- func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+ variants: function, method
+ device_check: NoCheck
+ device_guard: False
+ dispatch:
+ CompositeExplicitAutograd: slice_inverse_symint
+
- func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
dispatch:
CompositeExplicitAutogradNonFunctional: slice_scatter
autogen: slice_scatter.out
- tags: core
+ tags: [core, view_copy]
- func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
variants: function, method
device_check: NoCheck
device_guard: False
@@ -5560,10 +5615,18 @@
CPU: _sspaddmm_out_only_sparse
CUDA: _sspaddmm_out_only_sparse_cuda
SparseCPU: _sspaddmm_out_cpu
SparseCUDA: _sspaddmm_out_cuda
+- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _chunk_cat
+
+- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CompositeExplicitAutograd: _chunk_cat_out
+
- func: stack(Tensor[] tensors, int dim=0) -> Tensor
dispatch:
CompositeExplicitAutograd: stack
- func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
@@ -5751,10 +5814,11 @@
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
dispatch:
CPU, CUDA: std_mean
+ MPS: std_mean_mps
autogen: std_mean.correction_out
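
std_mean computes both statistics in one fused reduction; the dispatch entry above adds an MPS kernel for it. From Python:

    import torch

    x = torch.randn(4, 5)
    std, mean = torch.std_mean(x, dim=1, correction=1)  # one pass, two outputs
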
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
@@ -6006,11 +6070,10 @@
variants: function, method
dispatch:
CPU, MPS: roll
CUDA: roll_cuda
autogen: roll.out
- tags: core
# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args
- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
variants: function, method
@@ -6089,10 +6152,56 @@
tags: view_copy
dispatch:
CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
autogen: _nested_view_from_buffer_copy.out
+- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor
+ variants: function
+ device_check: NoCheck
+ tags: view_copy
+ dispatch:
+ CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy
+ autogen: _nested_view_from_jagged_copy.out
+
+- func: _nested_get_values(Tensor(a) self) -> Tensor(a)
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+- func: _nested_get_values_copy(Tensor self) -> Tensor
+ variants: function
+ device_check: NoCheck
+ tags: view_copy
+ dispatch:
+ CompositeExplicitAutogradNonFunctional: _nested_get_values_copy
+ autogen: _nested_get_values_copy.out
+
+- func: _nested_get_offsets(Tensor self) -> Tensor
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+# returns undefined Tensor if no lengths present
+- func: _nested_get_lengths(Tensor self) -> Tensor
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+- func: _nested_get_ragged_idx(Tensor self) -> int
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+- func: _nested_get_jagged_dummy(Tensor any) -> Tensor
+ category_override: dummy
+ dispatch: {}
+
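
These view/accessor ops support jagged-layout nested tensors; a minimal sketch, assuming a PyTorch build where layout=torch.jagged is available:

    import torch

    nt = torch.nested.nested_tensor(
        [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.jagged
    )
    values = nt.values()    # packed (5, 4) buffer the jagged view is built on
    offsets = nt.offsets()  # ragged-dim offsets, here tensor([0, 2, 5])
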
- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
dispatch:
# calls unsqueeze
CompositeExplicitAutogradNonFunctional: _trilinear
autogen: _trilinear.out
@@ -6273,10 +6382,11 @@
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
dispatch:
CPU, CUDA: var_mean
+ MPS: var_mean_mps
autogen: var_mean.correction_out
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
@@ -6293,19 +6403,17 @@
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
- CPU, CUDA: where
- MPS: where_mps
+ CPU, CUDA, MPS: where
tags: [core, pointwise]
- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
- CPU, CUDA: where_self_out
- MPS: where_self_out_mps
+ CPU, CUDA, MPS: where_self_out
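
The CPU, CUDA, and MPS paths for where now share a single kernel entry; the user-facing call is unchanged:

    import torch

    cond = torch.tensor([True, False, True])
    torch.where(cond, torch.ones(3), torch.zeros(3))  # tensor([1., 0., 1.])
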
- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
variants: function
- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor
@@ -6642,11 +6750,11 @@
dispatch:
CPU, CUDA: zero_
MPS: zero_mps_
Meta: zero_meta_
SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
- SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
MkldnnCPU: mkldnn_zero_
NestedTensorCPU, NestedTensorCUDA: zero_nested_
autogen: zero, zero.out
- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -6932,11 +7040,11 @@
# shared. In other words, their outputs are non-differentiable views of the
# sparse tensor.
# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
# the default would never make sense.
-- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
dispatch:
CompositeExplicitAutograd: sparse_compressed_tensor
- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
@@ -6949,11 +7057,14 @@
- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
-- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: _sparse_compressed_tensor_unsafe_symint
+
- func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
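
The compressed-sparse factories above now take SymInt sizes; the public constructors are unchanged. For example, building a small CSR matrix:

    import torch

    crow = torch.tensor([0, 2, 3])      # row 0 holds 2 values, row 1 holds 1
    col = torch.tensor([0, 1, 1])
    vals = torch.tensor([1., 2., 3.])
    csr = torch.sparse_csr_tensor(crow, col, vals, size=(2, 2))
    csr.to_dense()                      # tensor([[1., 2.], [0., 3.]])
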
@@ -7035,11 +7146,11 @@
- func: sparse_dim(Tensor self) -> int
variants: method
dispatch:
CPU, CUDA: sparse_dim_strided
SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
- SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
device_check: NoCheck
device_guard: False
# legacy method
- func: _dimI(Tensor self) -> int
@@ -7052,11 +7163,11 @@
- func: dense_dim(Tensor self) -> int
variants: method
dispatch:
CPU, CUDA: dense_dim_strided
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
- SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
device_check: NoCheck
device_guard: False
# legacy method
- func: _dimV(Tensor self) -> int
@@ -7068,11 +7179,11 @@
- func: _nnz(Tensor self) -> int
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse
- SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr
device_check: NoCheck
device_guard: False
# NOTE: [ coalesce autograd ]
# coalesce returns self directly for already coalesced sparse tensors.
@@ -7131,44 +7242,44 @@
- func: values(Tensor(a) self) -> Tensor(a)
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMeta: values_sparse
- SparseCsrCPU, SparseCsrCUDA: values_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
NestedTensorCPU, NestedTensorCUDA: values_nested
CompositeExplicitAutograd: values_default
device_check: NoCheck
device_guard: False
- func: crow_indices(Tensor(a) self) -> Tensor(a)
variants: method
dispatch:
- SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr
CompositeExplicitAutograd: crow_indices_default
device_check: NoCheck
device_guard: False
- func: col_indices(Tensor(a) self) -> Tensor(a)
variants: method
dispatch:
- SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr
CompositeExplicitAutograd: col_indices_default
device_check: NoCheck
device_guard: False
- func: ccol_indices(Tensor(a) self) -> Tensor(a)
variants: method
dispatch:
- SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr
CompositeExplicitAutograd: ccol_indices_default
device_check: NoCheck
device_guard: False
- func: row_indices(Tensor(a) self) -> Tensor(a)
variants: method
dispatch:
- SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr
CompositeExplicitAutograd: row_indices_default
device_check: NoCheck
device_guard: False
- func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
@@ -7673,10 +7784,11 @@
device_check: NoCheck
device_guard: False
dispatch:
CPU, CUDA, Meta, MPS: set_
autogen: set.source_Storage, set.source_Storage_out
+ tags: inplace_view
- func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
variants: method
device_check: NoCheck
device_guard: False
@@ -7685,34 +7797,38 @@
Meta: set_storage_meta__symint
CUDA: set_storage_cuda_
MPS: set_storage_mps_
QuantizedCPU, QuantizedCUDA: set_storage_quantized_
autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out
+ tags: inplace_view
- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
variants: method
device_check: NoCheck
device_guard: False
dispatch:
CompositeImplicitAutograd: set__symint
+ tags: inplace_view
- func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
variants: method
device_check: NoCheck
device_guard: False
dispatch:
CPU, CUDA, Meta, MPS: set_tensor_
autogen: set.source_Tensor, set.source_Tensor_out
+ tags: inplace_view
- func: set_(Tensor(a!) self) -> Tensor(a!)
variants: method
dispatch:
CPU: set_cpu_
CUDA: set_cuda_
Meta: set_meta_
MPS: set_mps_
autogen: set, set.out
+ tags: inplace_view
# Not making it CompositeImplicitAutograd because lift
# should be a primitive w.r.t. functorch
# TODO: this should have a view annotation
@@ -10110,16 +10226,18 @@
- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
variants: function
dispatch:
CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
+ CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out
- func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
variants: function
dispatch:
CUDA: _amp_update_scale_cuda_
+ CPU: _amp_update_scale_cpu_
autogen: _amp_update_scale, _amp_update_scale.out
#- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
#dispatch:
#CPU: _cat_cpu
@@ -12339,10 +12457,11 @@
python_module: nn
structured: True
dispatch:
CPU: upsample_linear1d_out_cpu
CUDA: upsample_linear1d_out_cuda
+ MPS: upsample_linear1d_out_mps
- func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
python_module: nn
structured_delegate: upsample_linear1d.out
@@ -12350,10 +12469,11 @@
python_module: nn
structured: True
dispatch:
CPU: upsample_linear1d_backward_out_cpu
CUDA: upsample_linear1d_backward_out_cuda
+ MPS: upsample_linear1d_backward_out_mps
- func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
python_module: nn
structured_delegate: upsample_linear1d_backward.grad_input
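
upsample_linear1d and its backward implement 1-D linear interpolation; with the MPS entries above, the usual F.interpolate call should also work on Apple-GPU tensors:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 3, 16)  # (N, C, W)
    y = F.interpolate(x, size=32, mode="linear", align_corners=False)
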
@@ -12822,11 +12942,11 @@
CompositeExplicitAutograd: isinf
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr
autogen: isinf.out
- tags: core
+ tags: [core, pointwise]
- func: record_stream(Tensor(a!) self, Stream s) -> ()
variants: method
dispatch:
CUDA: record_stream_cuda
@@ -13748,15 +13868,22 @@
- func: linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
python_module: linalg
dispatch:
CPU, CUDA: linalg_eig_out
+- func: _linalg_eigvals(Tensor self) -> Tensor
+ python_module: linalg
+ dispatch:
+ CPU, CUDA: _linalg_eigvals
+
- func: linalg_eigvals(Tensor self) -> Tensor
python_module: linalg
- func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
python_module: linalg
+ dispatch:
+ CPU, CUDA: linalg_eigvals_out
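
linalg_eigvals gains an explicit out= dispatch entry plus an internal _linalg_eigvals helper; from Python:

    import torch

    A = torch.randn(3, 3)
    w = torch.linalg.eigvals(A)               # complex eigenvalues
    out = torch.empty(3, dtype=torch.complex64)
    torch.linalg.eigvals(A, out=out)          # out= variant, now with its own kernel
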
# This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and
# `linalg.eigvalsh` as composite functions that call this one
- func: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)
structured_delegate: _linalg_eigh.eigenvalues
@@ -14056,10 +14183,16 @@
## Functions that are only for testing
# It is undocumented and should not be used outside of tests.
- func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
+# Note: for testing COW materialization within `at::parallel_for` loop function
+- func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor
+ variants: function
+ dispatch:
+ CompositeExplicitAutograd: _test_parallel_materialize
+
# Note: this function is only for testing.
- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor
python_module: nn
dispatch:
CPU: _test_optional_intlist
@@ -14390,10 +14523,11 @@
- func: split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
variants: function
dispatch:
CompositeExplicitAutograd: split_with_sizes_copy_out
+ CUDA: split_with_sizes_copy_out_cuda
- func: view_copy(Tensor self, SymInt[] size) -> Tensor
variants: function
dispatch:
CompositeExplicitAutogradNonFunctional: view_copy_symint
@@ -14466,23 +14600,32 @@
variants: function
tags: nondeterministic_seeded
- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
dispatch:
- CPU: _scaled_dot_product_flash_attention_cpu
CUDA: _scaled_dot_product_flash_attention_cuda
NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
tags: nondeterministic_seeded
+- func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
+ dispatch:
+ CPU: _scaled_dot_product_flash_attention_cpu
+ tags: nondeterministic_seeded
+
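
The CPU flash-attention path is split into its own _for_cpu op (now with attn_mask support); the public call remains F.scaled_dot_product_attention, and the routing to this entry is an assumption about the dispatcher's backend selection:

    import torch
    import torch.nn.functional as F

    q = k = v = torch.randn(1, 2, 8, 16)  # (batch, heads, seq_len, head_dim)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
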
- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
device_check: NoCheck
variants: function
dispatch:
- CPU: _scaled_dot_product_flash_attention_backward_cpu
CUDA: _scaled_dot_product_flash_attention_backward_cuda
NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested
+- func: _scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CPU: _scaled_dot_product_flash_attention_cpu_backward
+
- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
dispatch:
CUDA: _scaled_dot_product_efficient_attention_cuda
NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
tags: nondeterministic_seeded
@@ -14491,10 +14634,15 @@
device_check: NoCheck
dispatch:
CUDA: _scaled_dot_product_efficient_attention_backward_cuda
tags: nondeterministic_seeded
+- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
+ dispatch:
+ CUDA: _scaled_dot_product_cudnn_attention_cuda
+ tags: nondeterministic_seeded
+
- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
variants: function
dispatch:
CUDA: _flash_attention_forward
tags: nondeterministic_seeded
@@ -14503,12 +14651,12 @@
device_check: NoCheck
variants: function
dispatch:
CUDA: _flash_attention_backward
-# Returns ouput, logsumexp if compute_logsumexp
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
+# Returns output, logsumexp if compute_logsumexp
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
variants: function
dispatch:
CUDA: _efficient_attention_forward
tags: nondeterministic_seeded
@@ -15342,9 +15490,25 @@
device_check: NoCheck
variants: function
dispatch:
CUDA: _fused_adamw_kernel_cuda_
autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
+
+- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+ variants: function
+ dispatch:
+ CUDA: _fused_sgd_kernel_cuda_
+ autogen: _fused_sgd, _fused_sgd.out
+
+- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_sgd_kernel_cuda_
+ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
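
A hypothetical usage sketch, assuming a PyTorch build where torch.optim.SGD accepts fused=True (the flag that lowers to the _fused_sgd_ kernels above) and CUDA parameters are available:

    import torch

    params = [torch.nn.Parameter(torch.randn(10, device="cuda"))]
    opt = torch.optim.SGD(params, lr=0.1, momentum=0.9, fused=True)
    params[0].grad = torch.randn_like(params[0])
    opt.step()  # single fused kernel over all parameters
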
# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
- func: _propagate_xla_data(Tensor input, Tensor output) -> ()
variants: function