codegen/native_functions.yaml in torch-rb-0.15.0 vs codegen/native_functions.yaml in torch-rb-0.16.0

- old
+ new

@@ -132,11 +132,11 @@
  dispatch:
    CompositeExplicitAutograd: _new_zeros_with_same_feature_meta
  autogen: _new_zeros_with_same_feature_meta.out

# This function compares the storage numel of self with that of other, where
-# storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`.
+# storage numel is computed as: `other.storage().nbytes() / other.itemsize()`.
# We create this function for composite compliance purposes. The batching rule
# always returns true because vmapped as_strided does not support accessing
# storage locations not indexable by the input tensor.
# See the note above for more information.
- func: _has_same_storage_numel(Tensor self, Tensor other) -> bool

@@ -173,16 +173,28 @@
- func: _assert_async.msg(Tensor self, str assert_msg) -> ()
  dispatch:
    CPU: _assert_async_msg_cpu
    CUDA: _assert_async_msg_cuda

+- func: _assert_scalar(Scalar self, str assert_msg) -> ()
+  dispatch:
+    CompositeExplicitAutograd: _assert_scalar
+
+- func: _functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _functional_assert_scalar
+
- func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
  dispatch:
    CPU: _functional_assert_async_msg_cpu

- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()

+- func: _print(str s) -> ()
+  dispatch:
+    CompositeExplicitAutograd: _print
+
- func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
  dispatch:
    CompositeExplicitAutograd: sym_constrain_range

- func: sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> ()

@@ -468,10 +480,11 @@
  tags: pointwise

- func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU, CUDA: conj_physical_out
+    MPS: conj_physical_out_mps
    SparseCPU, SparseCUDA: conj_physical_out_sparse
    SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out
  tags: pointwise

- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)

@@ -562,12 +575,12 @@
    Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
    ScalarOnly: add (Bool)
  dispatch:
    SparseCPU: add_out_sparse_cpu
    SparseCUDA: add_out_sparse_cuda
-    SparseCsrCPU: add_out_sparse_csr_cpu
-    SparseCsrCUDA: add_out_sparse_csr_cuda
+    SparseCsrCPU: add_out_sparse_compressed_cpu
+    SparseCsrCUDA: add_out_sparse_compressed_cuda
    MkldnnCPU: mkldnn_add_out
    MPS: add_out_mps
  tags: pointwise

- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor

@@ -761,11 +774,11 @@
- func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
    CompositeExplicitAutograd: arange

-# This operator should be named `aragne.start_out` if following the naming convention. However that
+# This operator should be named `arange.start_out` if following the naming convention. However that
# name is already taken. Disabled because of CI job failures.
# FIXME: enable this
#- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
#  dispatch:
#    CompositeExplicitAutograd: arange_start_out

@@ -1218,10 +1231,17 @@
- func: copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CompositeExplicitAutograd: copysign_out
  tags: pointwise

+- func: _lazy_clone(Tensor self) -> Tensor
+  # Like clone, but the copy takes place lazily, only if either the
+  # input or the output are written.
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _lazy_clone
+
- func: logical_not(Tensor self) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: logical_not

@@ -1619,10 +1639,11 @@
    CompositeExplicitAutograd: complex

- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU, CUDA: complex_out
+    MPS: complex_out_mps

- func: polar(Tensor abs, Tensor angle) -> Tensor
  variants: function
  dispatch:
    CompositeExplicitAutograd: polar

@@ -1845,12 +1866,15 @@
  autogen: cudnn_batch_norm_backward.out

- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
  dispatch:
    CUDA: cudnn_convolution
-  autogen: cudnn_convolution.out

+- func: cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CUDA: cudnn_convolution_out
+
- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
  dispatch:
    CUDA: cudnn_convolution_transpose
  autogen: cudnn_convolution_transpose.out

@@ -2344,11 +2368,11 @@
    CUDA: empty_cuda
    MPS: empty_mps
    Meta: empty_meta_symint
    MkldnnCPU: empty_mkldnn
    SparseCPU, SparseCUDA, SparseMeta: empty_sparse
-    SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed
    QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
  tags: core

- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:

@@ -2450,11 +2474,11 @@
  device_guard: False
  dispatch:
    CompositeExplicitAutograd: empty_like
    QuantizedCPU, QuantizedCUDA: empty_like_quantized
    SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
-    SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
    NestedTensorCPU, NestedTensorCUDA: empty_like_nested
  autogen: empty_like.out

- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:

@@ -2952,42 +2976,48 @@
- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
  variants: function
  dispatch:
    CPU: _fft_r2c_mkl
    CUDA: _fft_r2c_cufft
+    MPS: _fft_r2c_mps

- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: _fft_r2c_mkl_out
    CUDA: _fft_r2c_cufft_out
+    MPS: _fft_r2c_mps_out

# Complex to real inverse FFT
- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
  variants: function
  dispatch:
    CPU: _fft_c2r_mkl
    CUDA: _fft_c2r_cufft
+    MPS: _fft_c2r_mps

- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: _fft_c2r_mkl_out
    CUDA: _fft_c2r_cufft_out
+    MPS: _fft_c2r_mps_out

# Standard complex to complex FFT (forward or backward)
- func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
  variants: function
  dispatch:
    CPU: _fft_c2c_mkl
    CUDA: _fft_c2c_cufft
+    MPS: _fft_c2c_mps

- func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: _fft_c2c_mkl_out
    CUDA: _fft_c2c_cufft_out
+    MPS: _fft_c2c_mps_out

- func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
  device_check: NoCheck
  variants: function
  dispatch:

@@ -3300,16 +3330,20 @@
- func: _cslt_compress(Tensor input) -> Tensor
  dispatch:
    CUDA: _cslt_compress

-- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> Tensor
+- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
  dispatch:
    CUDA: _cslt_sparse_mm

-- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
+- func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
  dispatch:
+    CUDA: _cslt_sparse_mm_search
+
+- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
+  dispatch:
    CUDA: _sparse_semi_structured_linear

- func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
  dispatch:
    CUDA: _mixed_dtypes_linear

@@ -4056,16 +4090,22 @@
  dispatch:
    CUDA: _int_mm_out_cuda

- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
  dispatch:
+    CPU: _convert_weight_to_int4pack_cpu
    CUDA: _convert_weight_to_int4pack_cuda

- func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
  dispatch:
+    CPU: _weight_int4pack_mm_cpu
    CUDA: _weight_int4pack_mm_cuda

+- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
+  dispatch:
+    CPU: _weight_int8pack_mm_cpu
+
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  python_module: sparse

- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
  python_module: sparse

@@ -4437,11 +4477,10 @@
  dispatch:
    CPU: pixel_shuffle_cpu
    MPS: pixel_shuffle_mps
    CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
  autogen: pixel_shuffle.out
-  tags: core

- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
  dispatch:
    CPU: pixel_unshuffle_cpu
    MPS: pixel_unshuffle_mps

@@ -4808,11 +4847,11 @@
  variants: function, method
  device_check: NoCheck
  device_guard: False
  dispatch:
    CompositeImplicitAutograd: reshape_symint
-    CompositeImplicitAutogradNestedTensor: reshape_nested
+    CompositeImplicitAutogradNestedTensor: reshape_nested_symint

- func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
  variants: function
  dispatch:
    CompositeExplicitAutograd: _reshape_copy_symint

@@ -4967,10 +5006,11 @@
- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
  structured_delegate: gelu.out
  device_check: NoCheck # TensorIterator
  python_module: nn
  dispatch:
+    QuantizedCPU: gelu_quantized_cpu_
    NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_

- func: gelu(Tensor self, *, str approximate='none') -> Tensor
  structured_delegate: gelu.out
  device_check: NoCheck # TensorIterator

@@ -5354,18 +5394,33 @@
  device_guard: False
  dispatch:
    CompositeExplicitAutograd: slice_backward
  autogen: slice_backward.out

+# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
+# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification
+# of PT2 graph input subclass instances that are views. This means:
+# * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
+# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
+# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph
+#   input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is
+#   easier to implement for a subclass than as_strided()
+- func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: slice_inverse_symint
+
- func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
  variants: function, method
  device_check: NoCheck
  device_guard: False
  dispatch:
    CompositeExplicitAutogradNonFunctional: slice_scatter
  autogen: slice_scatter.out
-  tags: core
+  tags: [core, view_copy]

@@ -5560,10 +5615,18 @@
    CPU: _sspaddmm_out_only_sparse
    CUDA: _sspaddmm_out_only_sparse_cuda
    SparseCPU: _sspaddmm_out_cpu
    SparseCUDA: _sspaddmm_out_cuda

+- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _chunk_cat
+
+- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: _chunk_cat_out
+
- func: stack(Tensor[] tensors, int dim=0) -> Tensor
  dispatch:
    CompositeExplicitAutograd: stack

- func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)

@@ -5751,10 +5814,11 @@
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function
  dispatch:
    CPU, CUDA: std_mean
+    MPS: std_mean_mps
  autogen: std_mean.correction_out

- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function

@@ -6006,11 +6070,10 @@
  variants: function, method
  dispatch:
    CPU, MPS: roll
    CUDA: roll_cuda
  autogen: roll.out
-  tags: core

# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args
- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
  variants: function, method

@@ -6089,10 +6152,56 @@
  tags: view_copy
  dispatch:
    CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
  autogen: _nested_view_from_buffer_copy.out

+- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor
+  variants: function
+  device_check: NoCheck
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy
+  autogen: _nested_view_from_jagged_copy.out
+
+- func: _nested_get_values(Tensor(a) self) -> Tensor(a)
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_get_values_copy(Tensor self) -> Tensor
+  variants: function
+  device_check: NoCheck
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _nested_get_values_copy
+  autogen: _nested_get_values_copy.out
+
+- func: _nested_get_offsets(Tensor self) -> Tensor
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+# returns undefined Tensor if no lengths present
+- func: _nested_get_lengths(Tensor self) -> Tensor
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_get_ragged_idx(Tensor self) -> int
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_get_jagged_dummy(Tensor any) -> Tensor
+  category_override: dummy
+  dispatch: {}
+
- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
  dispatch: # calls unsqueeze
    CompositeExplicitAutogradNonFunctional: _trilinear
  autogen: _trilinear.out

@@ -6273,10 +6382,11 @@
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function
  dispatch:
    CPU, CUDA: var_mean
+    MPS: var_mean_mps
  autogen: var_mean.correction_out

- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function

@@ -6293,19 +6403,17 @@
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method
  dispatch:
-    CPU, CUDA: where
-    MPS: where_mps
+    CPU, CUDA, MPS: where
  tags: [core, pointwise]

- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  dispatch:
-    CPU, CUDA: where_self_out
-    MPS: where_self_out_mps
+    CPU, CUDA, MPS: where_self_out

- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
  variants: function

- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor

@@ -6642,11 +6750,11 @@
  dispatch:
    CPU, CUDA: zero_
    MPS: zero_mps_
    Meta: zero_meta_
    SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
-    SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
    MkldnnCPU: mkldnn_zero_
    NestedTensorCPU, NestedTensorCUDA: zero_nested_
  autogen: zero, zero.out

- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)

@@ -6932,11 +7040,11 @@
# shared. In other words, their outputs are non-differentiable views of the
# sparse tensor.
# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
# the default would never make sense.
-- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
  dispatch:
    CompositeExplicitAutograd: sparse_compressed_tensor

- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor

@@ -6949,11 +7057,14 @@
- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor

-- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _sparse_compressed_tensor_unsafe_symint
+
- func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

@@ -7035,11 +7146,11 @@
- func: sparse_dim(Tensor self) -> int
  variants: method
  dispatch:
    CPU, CUDA: sparse_dim_strided
    SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
-    SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
  device_check: NoCheck
  device_guard: False

# legacy method
- func: _dimI(Tensor self) -> int

@@ -7052,11 +7163,11 @@
- func: dense_dim(Tensor self) -> int
  variants: method
  dispatch:
    CPU, CUDA: dense_dim_strided
    SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
-    SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
  device_check: NoCheck
  device_guard: False

# legacy method
- func: _dimV(Tensor self) -> int

@@ -7068,11 +7179,11 @@
- func: _nnz(Tensor self) -> int
  variants: method
  dispatch:
    SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse
-    SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr
  device_check: NoCheck
  device_guard: False

# NOTE: [ coalesce autograd ]
# coalesce returns self directly for already coalesced sparse tensors.

@@ -7131,44 +7242,44 @@
- func: values(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
    SparseCPU, SparseCUDA, SparseMeta: values_sparse
-    SparseCsrCPU, SparseCsrCUDA: values_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
    NestedTensorCPU, NestedTensorCUDA: values_nested
    CompositeExplicitAutograd: values_default
  device_check: NoCheck
  device_guard: False

- func: crow_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
-    SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr
    CompositeExplicitAutograd: crow_indices_default
  device_check: NoCheck
  device_guard: False

- func: col_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
-    SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr
    CompositeExplicitAutograd: col_indices_default
  device_check: NoCheck
  device_guard: False

- func: ccol_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
-    SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr
    CompositeExplicitAutograd: ccol_indices_default
  device_check: NoCheck
  device_guard: False

- func: row_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
-    SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr
    CompositeExplicitAutograd: row_indices_default
  device_check: NoCheck
  device_guard: False

- func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)

@@ -7673,10 +7784,11 @@
  device_check: NoCheck
  device_guard: False
  dispatch:
    CPU, CUDA, Meta, MPS: set_
  autogen: set.source_Storage, set.source_Storage_out
+  tags: inplace_view

- func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
  variants: method
  device_check: NoCheck
  device_guard: False

@@ -7685,34 +7797,38 @@
    Meta: set_storage_meta__symint
    CUDA: set_storage_cuda_
    MPS: set_storage_mps_
    QuantizedCPU, QuantizedCUDA: set_storage_quantized_
  autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out
+  tags: inplace_view

- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
  variants: method
  device_check: NoCheck
  device_guard: False
  dispatch:
    CompositeImplicitAutograd: set__symint
+  tags: inplace_view

- func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
  variants: method
  device_check: NoCheck
  device_guard: False
  dispatch:
    CPU, CUDA, Meta, MPS: set_tensor_
  autogen: set.source_Tensor, set.source_Tensor_out
+  tags: inplace_view

- func: set_(Tensor(a!) self) -> Tensor(a!)
  variants: method
  dispatch:
    CPU: set_cpu_
    CUDA: set_cuda_
    Meta: set_meta_
    MPS: set_mps_
  autogen: set, set.out
+  tags: inplace_view

# Not making it CompositeImplicitAutograd because lift
# should be a primitive w.r.t. functorch
# TODO: this should have a view annotation

@@ -10110,16 +10226,18 @@
- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
  variants: function
  dispatch:
    CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
+    CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
  autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out

- func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
  variants: function
  dispatch:
    CUDA: _amp_update_scale_cuda_
+    CPU: _amp_update_scale_cpu_
  autogen: _amp_update_scale, _amp_update_scale.out

#- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
  #dispatch:
    #CPU: _cat_cpu

@@ -12339,10 +12457,11 @@
  python_module: nn
  structured: True
  dispatch:
    CPU: upsample_linear1d_out_cpu
    CUDA: upsample_linear1d_out_cuda
+    MPS: upsample_linear1d_out_mps

- func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
  python_module: nn
  structured_delegate: upsample_linear1d.out

@@ -12350,10 +12469,11 @@
  python_module: nn
  structured: True
  dispatch:
    CPU: upsample_linear1d_backward_out_cpu
    CUDA: upsample_linear1d_backward_out_cuda
+    MPS: upsample_linear1d_backward_out_mps

- func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
  python_module: nn
  structured_delegate: upsample_linear1d_backward.grad_input

@@ -12822,11 +12942,11 @@
    CompositeExplicitAutograd: isinf
    SparseCPU, SparseCUDA: isinf_sparse
    SparseMeta: isinf_sparse_meta
    SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr
  autogen: isinf.out
-  tags: core
+  tags: [core, pointwise]

- func: record_stream(Tensor(a!) self, Stream s) -> ()
  variants: method
  dispatch:
    CUDA: record_stream_cuda

@@ -13748,15 +13868,22 @@
- func: linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
  python_module: linalg
  dispatch:
    CPU, CUDA: linalg_eig_out

+- func: _linalg_eigvals(Tensor self) -> Tensor
+  python_module: linalg
+  dispatch:
+    CPU, CUDA: _linalg_eigvals
+
- func: linalg_eigvals(Tensor self) -> Tensor
  python_module: linalg

- func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  python_module: linalg
+  dispatch:
+    CPU, CUDA: linalg_eigvals_out

# This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and
# `linalg.eigvalsh` as composite functions that call this one
- func: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)
  structured_delegate: _linalg_eigh.eigenvalues

@@ -14056,10 +14183,16 @@
## Functions that are only for testing
# It is undocumented and should not be used outside of tests.
- func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor

+# Note: for testing COW materialization within `at::parallel_for` loop function
+- func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _test_parallel_materialize
+
# Note: this function is only for testing.
- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor
  python_module: nn
  dispatch:
    CPU: _test_optional_intlist

@@ -14390,10 +14523,11 @@
- func: split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
  variants: function
  dispatch:
    CompositeExplicitAutograd: split_with_sizes_copy_out
+    CUDA: split_with_sizes_copy_out_cuda

- func: view_copy(Tensor self, SymInt[] size) -> Tensor
  variants: function
  dispatch:
    CompositeExplicitAutogradNonFunctional: view_copy_symint

@@ -14466,23 +14600,32 @@
  variants: function
  tags: nondeterministic_seeded

- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
-    CPU: _scaled_dot_product_flash_attention_cpu
    CUDA: _scaled_dot_product_flash_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
  tags: nondeterministic_seeded

+- func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
+  dispatch:
+    CPU: _scaled_dot_product_flash_attention_cpu
+  tags: nondeterministic_seeded
+
- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
  device_check: NoCheck
  variants: function
  dispatch:
-    CPU: _scaled_dot_product_flash_attention_backward_cpu
    CUDA: _scaled_dot_product_flash_attention_backward_cuda
    NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested

+- func: _scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: _scaled_dot_product_flash_attention_cpu_backward
+
- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
  dispatch:
    CUDA: _scaled_dot_product_efficient_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
  tags: nondeterministic_seeded

@@ -14491,10 +14634,15 @@
  device_check: NoCheck
  dispatch:
    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
  tags: nondeterministic_seeded

+- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
+  dispatch:
+    CUDA: _scaled_dot_product_cudnn_attention_cuda
+  tags: nondeterministic_seeded
+
- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  variants: function
  dispatch:
    CUDA: _flash_attention_forward
  tags: nondeterministic_seeded

@@ -14503,12 +14651,12 @@
  device_check: NoCheck
  variants: function
  dispatch:
    CUDA: _flash_attention_backward

-# Returns ouput, logsumexp if compute_logsumexp
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
+# Returns output, logsumexp if compute_logsumexp
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
  variants: function
  dispatch:
    CUDA: _efficient_attention_forward
  tags: nondeterministic_seeded

@@ -15342,9 +15490,25 @@
  device_check: NoCheck
  variants: function
  dispatch:
    CUDA: _fused_adamw_kernel_cuda_
  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
+
+- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  variants: function
+  dispatch:
+    CUDA: _fused_sgd_kernel_cuda_
+  autogen: _fused_sgd, _fused_sgd.out
+
+- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  # but still skip the device check as the Tensor LR can be on CPU
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _fused_sgd_kernel_cuda_
+  autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out

# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
- func: _propagate_xla_data(Tensor input, Tensor output) -> ()
  variants: function
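For reference, the `_has_same_storage_numel` comment near the top of the diff defines storage numel as `other.storage().nbytes() / other.itemsize()`. The Python sketch below is not part of the diff; the tensor shape and variable names are illustrative only. It shows how that quantity can differ from `Tensor.numel()` when a tensor is a strided view over a larger storage:

import torch

# Illustrative sketch of the storage-numel formula from the comment above:
# storage numel = nbytes / itemsize, i.e. the size of the whole backing
# storage, not just the elements visible through the view.
base = torch.arange(12, dtype=torch.float32)
view = base[::2]  # 6 visible elements, backed by the full 48-byte storage

storage_numel = view.untyped_storage().nbytes() // view.element_size()
print(view.numel())   # 6
print(storage_numel)  # 12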
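The entries above only declare operator schemas; whether a given op is callable at runtime depends on the libtorch build the bindings link against. A hedged sketch, assuming a Python install of a PyTorch release that contains the ops added in this diff (`aten::_chunk_cat` is used here purely as an example), for probing availability and printing the registered schema:

import torch

# Probe for one of the newly declared ops and print its registered schema.
# If the linked libtorch predates the op, report that instead of failing.
try:
    print(torch.ops.aten._chunk_cat.default._schema)
except (AttributeError, RuntimeError):
    print("aten::_chunk_cat is not available in this libtorch build")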