codegen/native_functions.yaml in torch-rb-0.16.0 vs codegen/native_functions.yaml in torch-rb-0.17.0
- old
+ new
@@ -547,24 +547,24 @@
- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: add.out
variants: function, method
dispatch:
- SparseCPU, SparseCUDA: add_sparse
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
tags: [core, pointwise]
- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
structured_delegate: add.out
dispatch:
- SparseCPU, SparseCUDA: add_sparse_
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse_
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
tags: pointwise
- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -573,13 +573,13 @@
structured_inherits: TensorIteratorBase
ufunc_inner_loop:
Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
ScalarOnly: add (Bool)
dispatch:
- SparseCPU: add_out_sparse_cpu
+ SparseCPU, SparseMeta: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
- SparseCsrCPU: add_out_sparse_compressed_cpu
+ SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
SparseCsrCUDA: add_out_sparse_compressed_cuda
MkldnnCPU: mkldnn_add_out
MPS: add_out_mps
tags: pointwise
@@ -1748,10 +1748,11 @@
CompositeImplicitAutograd: conv_transpose3d_symint
- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
variants: function
dispatch:
+ Meta: copy_meta
CompositeExplicitAutogradNonFunctional: copy
tags: core
- func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
variants: method
@@ -3125,10 +3126,11 @@
- func: isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
variants: function
structured: True
dispatch:
CPU, CUDA: isin_Tensor_Tensor_out
+ MPS: isin_Tensor_Tensor_out_mps
- func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
variants: function
structured_delegate: isin.Tensor_Tensor_out
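
Note: the MPS line above only adds an Apple-GPU backend for the existing isin op; the call itself is unchanged. A minimal sketch (defaults to CPU, so it runs anywhere):

    import torch

    elements = torch.tensor([1, 2, 3, 4])
    test_elements = torch.tensor([2, 4])
    mask = torch.isin(elements, test_elements)   # tensor([False, True, False, True])
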
@@ -3266,10 +3268,12 @@
MPS: layer_norm_backward_mps
NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
autogen: native_layer_norm_backward.out
tags: core
+- func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
+
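
Note: the new rms_norm entry is the schema behind torch.rms_norm / torch.nn.functional.rms_norm. A minimal sketch, assuming PyTorch 2.4-style semantics where eps=None picks a dtype-dependent default:

    import torch

    x = torch.randn(2, 8)
    weight = torch.ones(8)
    # normalize over the trailing dimension(s) given by normalized_shape
    y = torch.rms_norm(x, [8], weight, 1e-6)
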
- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num
SparseCPU, SparseCUDA: nan_to_num_sparse
@@ -3338,14 +3342,35 @@
- func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
dispatch:
CUDA: _cslt_sparse_mm_search
+- func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CUDA: _sparse_semi_structured_tile
+
+- func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor)
+ dispatch:
+ CUDA: _sparse_semi_structured_apply
+
+- func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_apply_dense
+
+# DEPRECATED: Use torch._sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead
- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
dispatch:
CUDA: _sparse_semi_structured_linear
+- func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_mm
+
+- func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_addmm
+
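
Note: these private kernels appear to back the 2:4 ("semi-structured") sparsity path, with _sparse_semi_structured_mm/_sparse_semi_structured_addmm named as the replacements for the deprecated _sparse_semi_structured_linear. A hedged sketch via the public wrapper torch.sparse.to_sparse_semi_structured rather than the private ops (assumes a CUDA build on an Ampere-or-newer GPU and fp16 inputs):

    import torch
    from torch.sparse import to_sparse_semi_structured

    if torch.cuda.is_available():
        w = torch.randn(128, 128, dtype=torch.float16, device="cuda")
        # zero out 2 of every 4 consecutive elements to satisfy the 2:4 pattern
        w = w * (torch.arange(w.numel(), device="cuda").reshape(w.shape) % 4 < 2)
        x = torch.randn(128, 128, dtype=torch.float16, device="cuda")
        y = torch.mm(to_sparse_semi_structured(w), x)   # routes to the sparse kernels
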
- func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
dispatch:
CUDA: _mixed_dtypes_linear
- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
@@ -4082,29 +4107,33 @@
SparseCPU, SparseCUDA: _sparse_mm_out
SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
- func: _int_mm(Tensor self, Tensor mat2) -> Tensor
dispatch:
+ CPU: _int_mm_cpu
CUDA: _int_mm_cuda
- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
+ CPU: _int_mm_out_cpu
CUDA: _int_mm_out_cuda
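
Note: the new CPU kernels make the int8 matmul helper usable without CUDA. A small sketch, assuming the op is exposed as torch._int_mm and with shapes chosen to also satisfy the CUDA kernel's divisibility checks:

    import torch

    a = torch.randint(-128, 127, (32, 64), dtype=torch.int8)
    b = torch.randint(-128, 127, (64, 32), dtype=torch.int8)
    c = torch._int_mm(a, b)   # int32 accumulation, shape (32, 32)
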
- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
dispatch:
CPU: _convert_weight_to_int4pack_cpu
CUDA: _convert_weight_to_int4pack_cuda
- func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
dispatch:
CPU: _weight_int4pack_mm_cpu
+ MPS: _weight_int4pack_mm_mps
CUDA: _weight_int4pack_mm_cuda
- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
dispatch:
CPU: _weight_int8pack_mm_cpu
+ MPS: _weight_int8pack_mm_mps
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
@@ -5395,11 +5424,11 @@
dispatch:
CompositeExplicitAutograd: slice_backward
autogen: slice_backward.out
# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
-# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification
+# slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification
# of PT2 graph input subclass instances that are views. This means:
# * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph
# input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is
@@ -5618,14 +5647,16 @@
SparseCUDA: _sspaddmm_out_cuda
- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
dispatch:
CompositeExplicitAutograd: _chunk_cat
+ CUDA: _chunk_cat_cuda
- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: _chunk_cat_out
+ CUDA: _chunk_cat_out_cuda
- func: stack(Tensor[] tensors, int dim=0) -> Tensor
dispatch:
CompositeExplicitAutograd: stack
@@ -5687,12 +5718,12 @@
- func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: sum
- SparseCPU, SparseCUDA: sum_coo
- SparseCsrCPU, SparseCsrCUDA: sum_csr
+ SparseCPU, SparseCUDA, SparseMeta: sum_coo
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
# TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
structured_delegate: sum.IntList_out
@@ -6198,10 +6229,16 @@
- func: _nested_get_jagged_dummy(Tensor any) -> Tensor
category_override: dummy
dispatch: {}
+- func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
+ variants: function
+ device_check: NoCheck
+ dispatch:
+ CPU, CUDA: _nested_compute_contiguous_strides_offsets
+
- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
dispatch:
# calls unsqueeze
CompositeExplicitAutogradNonFunctional: _trilinear
autogen: _trilinear.out
@@ -6463,11 +6500,11 @@
- func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CPU: _efficientzerotensor
CUDA: _efficientzerotensor_cuda
MPS: _efficientzerotensor_mps
- Meta: _efficientzerotensor_meta
+ Meta: _efficientzerotensor_meta_symint
autogen: _efficientzerotensor.out
- func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CompositeExplicitAutograd: zeros_symint
@@ -6540,10 +6577,36 @@
- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor
dispatch:
SparseCPU, SparseCUDA: norm_sparse
autogen: native_norm.ScalarOpt_dim_dtype_out
+- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _batch_norm_with_update_cpu
+ CUDA: _batch_norm_with_update_cuda
+ MPS: _batch_norm_with_update_mps
+ MkldnnCPU: _batch_norm_with_update_mkldnn
+ autogen: _batch_norm_with_update_functional
+
+- func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
+ dispatch:
+ CPU: _batch_norm_with_update_cpu_out
+ CUDA: _batch_norm_with_update_cuda_out
+ MPS: _batch_norm_with_update_mps_out
+
+- func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CompositeExplicitAutograd: _batch_norm_no_update
+ autogen: _batch_norm_no_update.out
+
+- func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _new_batch_norm_backward_cpu
+ CUDA: _new_batch_norm_backward_cuda
+ MPS: _new_batch_norm_backward_mps
+ MkldnnCPU: _new_batch_norm_backward_mkldnn
+
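
Note: these are new private composite batch-norm entry points that consolidate the train/eval paths (apparently for the torch.compile batch-norm rewrite). Going through torch.ops.aten avoids assuming they are re-exported under torch.*; a hedged sketch of the functional form:

    import torch

    x = torch.randn(4, 3, 8, 8)
    weight, bias = torch.ones(3), torch.zeros(3)
    running_mean, running_var = torch.zeros(3), torch.ones(3)
    # updates running_mean/running_var in place; returns output, saved stats, reserve
    out, save_mean, save_invstd, reserve = torch.ops.aten._batch_norm_with_update(
        x, weight, bias, running_mean, running_var, 0.1, 1e-5)
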
# TODO: reduce signatures down to one when optional args is available
- func: _sparse_sum(Tensor self) -> Tensor
- func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor
@@ -7040,10 +7103,14 @@
# shared. In other words, their outputs are non-differentiable views of the
# sparse tensor.
# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
# the default would never make sense.
+- func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: sparse_compressed_tensor_with_dims
+
- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
dispatch:
CompositeExplicitAutograd: sparse_compressed_tensor
- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
@@ -7144,13 +7211,13 @@
- func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor
- func: sparse_dim(Tensor self) -> int
variants: method
dispatch:
- CPU, CUDA: sparse_dim_strided
SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
+ CompositeExplicitAutograd: sparse_dim_default
device_check: NoCheck
device_guard: False
# legacy method
- func: _dimI(Tensor self) -> int
@@ -7161,13 +7228,13 @@
device_guard: False
- func: dense_dim(Tensor self) -> int
variants: method
dispatch:
- CPU, CUDA: dense_dim_strided
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
+ CompositeExplicitAutograd: dense_dim_default
device_check: NoCheck
device_guard: False
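
Note: with sparse_dim and dense_dim now falling back to a CompositeExplicitAutograd default, the strided case is no longer limited to CPU/CUDA kernels. A quick behavioral sketch, assuming the usual semantics (a strided tensor reports 0 sparse dims and ndim dense dims):

    import torch

    x = torch.randn(2, 3)
    x.sparse_dim(), x.dense_dim()   # (0, 2) for a strided tensor
    s = x.to_sparse()
    s.sparse_dim(), s.dense_dim()   # (2, 0) for a COO tensor
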
# legacy method
- func: _dimV(Tensor self) -> int
@@ -7294,11 +7361,11 @@
- func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
device_check: NoCheck # Allows copy into different device
variants: function
dispatch:
- SparseCPU, SparseCUDA: copy_sparse_
+ SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
- func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
variants: function, method
@@ -7397,11 +7464,11 @@
python_module: nn
dispatch:
MkldnnCPU: mkldnn_reorder_conv2d_weight
autogen: mkldnn_reorder_conv2d_weight.out
-- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
variants: function
python_module: nn
dispatch:
MkldnnCPU: mkldnn_reorder_conv3d_weight
autogen: mkldnn_reorder_conv3d_weight.out
@@ -7645,11 +7712,11 @@
- func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType
variants: function
- func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType
-- func: can_cast(ScalarType from, ScalarType to) -> bool
+- func: can_cast(ScalarType from_, ScalarType to) -> bool
variants: function
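
Note: the only change to can_cast is renaming the first schema argument from `from` (a Python keyword) to `from_`, which makes the keyword form usable; treat that keyword spelling as the 0.17/PyTorch 2.4-specific part of this sketch:

    import torch

    torch.can_cast(torch.float64, torch.float32)         # True (same-kind downcast)
    torch.can_cast(torch.float32, torch.int32)           # False (float -> int is unsafe)
    torch.can_cast(from_=torch.int32, to=torch.float32)  # keyword form with the new name
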
- func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
variants: function
@@ -10220,10 +10287,11 @@
- func: alias(Tensor(a) self) -> Tensor(a)
variants: method, function
dispatch:
CompositeExplicitAutograd: alias
+ NestedTensorCPU, NestedTensorCUDA: alias_nested
tags: core
- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
variants: function
dispatch:
@@ -10253,1047 +10321,1061 @@
- func: _foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow
CUDA: foreach_tensor_add_scalar_kernel_cuda
- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_
CUDA: foreach_tensor_add_scalar_kernel_cuda_
autogen: _foreach_add.Scalar_out
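
Note: the pattern repeated throughout this block moves the slow (non-fused) _foreach_* kernels from a CPU-only registration to CompositeExplicitAutograd, so backends other than CUDA fall back to the slow path automatically while CUDA keeps its dedicated fused kernels. The Python surface is unchanged, e.g.:

    import torch

    params = [torch.randn(3) for _ in range(4)]
    out = torch._foreach_add(params, 1.0)   # list of new tensors
    torch._foreach_add_(params, 1.0)        # in-place variant, returns None
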
- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow
CUDA: foreach_tensor_add_list_kernel_cuda
- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_
CUDA: foreach_tensor_add_list_kernel_cuda_
autogen: _foreach_add.List_out
- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow
CUDA: foreach_tensor_add_scalarlist_kernel_cuda
- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_
CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
autogen: _foreach_add.ScalarList_out
- func: _foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_tensor_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow
CUDA: foreach_tensor_add_tensor_kernel_cuda
- func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_add_tensor_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_
CUDA: foreach_tensor_add_tensor_kernel_cuda_
autogen: _foreach_add.Tensor_out
- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow
CUDA: foreach_tensor_sub_scalar_kernel_cuda
- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_
CUDA: foreach_tensor_sub_scalar_kernel_cuda_
autogen: _foreach_sub.Scalar_out
- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow
CUDA: foreach_tensor_sub_list_kernel_cuda
- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_
CUDA: foreach_tensor_sub_list_kernel_cuda_
autogen: _foreach_sub.List_out
- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow
CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_
CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
autogen: _foreach_sub.ScalarList_out
- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow
CUDA: foreach_tensor_mul_scalar_kernel_cuda
- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_
CUDA: foreach_tensor_mul_scalar_kernel_cuda_
autogen: _foreach_mul.Scalar_out
- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow
CUDA: foreach_tensor_mul_list_kernel_cuda
- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_
CUDA: foreach_tensor_mul_list_kernel_cuda_
autogen: _foreach_mul.List_out
- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow
CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_
CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
autogen: _foreach_mul.ScalarList_out
- func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_tensor_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow
CUDA: foreach_tensor_mul_tensor_kernel_cuda
- func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_mul_tensor_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_
CUDA: foreach_tensor_mul_tensor_kernel_cuda_
autogen: _foreach_mul.Tensor_out
- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow
CUDA: foreach_tensor_div_scalar_kernel_cuda
- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_
CUDA: foreach_tensor_div_scalar_kernel_cuda_
autogen: _foreach_div.Scalar_out
- func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
CUDA: foreach_tensor_div_list_kernel_cuda
- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
CUDA: foreach_tensor_div_list_kernel_cuda_
autogen: _foreach_div.List_out
- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow
CUDA: foreach_tensor_div_scalarlist_kernel_cuda
- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_
CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
autogen: _foreach_div.ScalarList_out
- func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_tensor_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
CUDA: foreach_tensor_div_tensor_kernel_cuda
- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_div_tensor_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
CUDA: foreach_tensor_div_tensor_kernel_cuda_
autogen: _foreach_div.Tensor_out
- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
autogen: _foreach_clamp_max.Scalar_out
- func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
CUDA: foreach_tensor_clamp_max_list_kernel_cuda
- func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
autogen: _foreach_clamp_max.List_out
- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
autogen: _foreach_clamp_max.ScalarList_out
- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
autogen: _foreach_clamp_min.Scalar_out
- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
CUDA: foreach_tensor_clamp_min_list_kernel_cuda
- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
autogen: _foreach_clamp_min.List_out
- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
autogen: _foreach_clamp_min.ScalarList_out
# foreach_minimum/maximum dispatches to clamp_max/min
- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
autogen: _foreach_maximum.Scalar_out
# foreach_minimum/maximum dispatches to clamp_max/min
- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
CUDA: foreach_tensor_clamp_min_list_kernel_cuda
- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
autogen: _foreach_maximum.List_out
# foreach_minimum/maximum dispatches to clamp_max/min
- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
autogen: _foreach_maximum.ScalarList_out
- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
autogen: _foreach_minimum.Scalar_out
- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
CUDA: foreach_tensor_clamp_max_list_kernel_cuda
- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
autogen: _foreach_minimum.List_out
- func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
- func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
autogen: _foreach_minimum.ScalarList_out
- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalar_slow
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow
CUDA: foreach_tensor_addcdiv_scalar_cuda
- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalarlist_slow
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow
CUDA: foreach_tensor_addcdiv_scalarlist_cuda
- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_tensor_slow
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow
CUDA: foreach_tensor_addcdiv_tensor_cuda
- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalar_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_
CUDA: foreach_tensor_addcdiv_scalar_cuda_
autogen: _foreach_addcdiv.Scalar_out
- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_
CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
autogen: _foreach_addcdiv.ScalarList_out
- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcdiv_tensor_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_
CUDA: foreach_tensor_addcdiv_tensor_cuda_
autogen: _foreach_addcdiv.Tensor_out
- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalar_slow
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow
CUDA: foreach_tensor_addcmul_scalar_cuda
- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalarlist_slow
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow
CUDA: foreach_tensor_addcmul_scalarlist_cuda
- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_tensor_slow
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow
CUDA: foreach_tensor_addcmul_tensor_cuda
- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalar_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_
CUDA: foreach_tensor_addcmul_scalar_cuda_
autogen: _foreach_addcmul.Scalar_out
- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_scalarlist_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_
CUDA: foreach_tensor_addcmul_scalarlist_cuda_
autogen: _foreach_addcmul.ScalarList_out
- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_addcmul_tensor_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_
CUDA: foreach_tensor_addcmul_tensor_cuda_
autogen: _foreach_addcmul.Tensor_out
- func: _foreach_abs(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_abs_slow
+ CompositeExplicitAutograd: foreach_tensor_abs_slow
CUDA: foreach_tensor_abs_cuda
- func: _foreach_abs_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_abs_slow_
+ CompositeExplicitAutograd: foreach_tensor_abs_slow_
CUDA: foreach_tensor_abs_cuda_
autogen: _foreach_abs.out
- func: _foreach_acos(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_acos_slow
+ CompositeExplicitAutograd: foreach_tensor_acos_slow
CUDA: foreach_tensor_acos_cuda
- func: _foreach_acos_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_acos_slow_
+ CompositeExplicitAutograd: foreach_tensor_acos_slow_
CUDA: foreach_tensor_acos_cuda_
autogen: _foreach_acos.out
- func: _foreach_asin(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_asin_slow
+ CompositeExplicitAutograd: foreach_tensor_asin_slow
CUDA: foreach_tensor_asin_cuda
- func: _foreach_asin_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_asin_slow_
+ CompositeExplicitAutograd: foreach_tensor_asin_slow_
CUDA: foreach_tensor_asin_cuda_
autogen: _foreach_asin.out
- func: _foreach_atan(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_atan_slow
+ CompositeExplicitAutograd: foreach_tensor_atan_slow
CUDA: foreach_tensor_atan_cuda
- func: _foreach_atan_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_atan_slow_
+ CompositeExplicitAutograd: foreach_tensor_atan_slow_
CUDA: foreach_tensor_atan_cuda_
autogen: _foreach_atan.out
- func: _foreach_ceil(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_ceil_slow
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow
CUDA: foreach_tensor_ceil_cuda
- func: _foreach_ceil_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_ceil_slow_
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow_
CUDA: foreach_tensor_ceil_cuda_
autogen: _foreach_ceil.out
- func: _foreach_cos(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_cos_slow
+ CompositeExplicitAutograd: foreach_tensor_cos_slow
CUDA: foreach_tensor_cos_cuda
- func: _foreach_cos_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_cos_slow_
+ CompositeExplicitAutograd: foreach_tensor_cos_slow_
CUDA: foreach_tensor_cos_cuda_
autogen: _foreach_cos.out
- func: _foreach_cosh(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_cosh_slow
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow
CUDA: foreach_tensor_cosh_cuda
- func: _foreach_cosh_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_cosh_slow_
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow_
CUDA: foreach_tensor_cosh_cuda_
autogen: _foreach_cosh.out
- func: _foreach_erf(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_erf_slow
+ CompositeExplicitAutograd: foreach_tensor_erf_slow
CUDA: foreach_tensor_erf_cuda
- func: _foreach_erf_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_erf_slow_
+ CompositeExplicitAutograd: foreach_tensor_erf_slow_
CUDA: foreach_tensor_erf_cuda_
autogen: _foreach_erf.out
- func: _foreach_erfc(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_erfc_slow
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow
CUDA: foreach_tensor_erfc_cuda
- func: _foreach_erfc_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_erfc_slow_
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow_
CUDA: foreach_tensor_erfc_cuda_
autogen: _foreach_erfc.out
- func: _foreach_exp(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_exp_slow
+ CompositeExplicitAutograd: foreach_tensor_exp_slow
CUDA: foreach_tensor_exp_cuda
- func: _foreach_exp_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_exp_slow_
+ CompositeExplicitAutograd: foreach_tensor_exp_slow_
CUDA: foreach_tensor_exp_cuda_
autogen: _foreach_exp.out
- func: _foreach_expm1(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_expm1_slow
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow
CUDA: foreach_tensor_expm1_cuda
- func: _foreach_expm1_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_expm1_slow_
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow_
CUDA: foreach_tensor_expm1_cuda_
autogen: _foreach_expm1.out
- func: _foreach_floor(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_floor_slow
+ CompositeExplicitAutograd: foreach_tensor_floor_slow
CUDA: foreach_tensor_floor_cuda
- func: _foreach_floor_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_floor_slow_
+ CompositeExplicitAutograd: foreach_tensor_floor_slow_
CUDA: foreach_tensor_floor_cuda_
autogen: _foreach_floor.out
- func: _foreach_frac(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_frac_slow
+ CompositeExplicitAutograd: foreach_tensor_frac_slow
CUDA: foreach_tensor_frac_cuda
- func: _foreach_frac_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_frac_slow_
+ CompositeExplicitAutograd: foreach_tensor_frac_slow_
CUDA: foreach_tensor_frac_cuda_
autogen: _foreach_frac.out
- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_ternary_lerp_slow
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow
CUDA: foreach_tensor_lerp_ternary_cuda
autogen: _foreach_lerp.List_out
- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_ternary_lerp_slow_
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_
CUDA: foreach_tensor_lerp_ternary_cuda_
autogen: _foreach_lerp.List_out
- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_lerp_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow
CUDA: foreach_tensor_lerp_list_cuda
autogen: _foreach_lerp.Scalar_out
- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_lerp_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_
CUDA: foreach_tensor_lerp_list_cuda_
autogen: _foreach_lerp.Scalar_out
- func: _foreach_lgamma(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_lgamma_slow
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow
CUDA: foreach_tensor_lgamma_cuda
- func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_lgamma_slow_
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow_
CUDA: foreach_tensor_lgamma_cuda_
autogen: _foreach_lgamma.out
- func: _foreach_log(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log_slow
+ CompositeExplicitAutograd: foreach_tensor_log_slow
CUDA: foreach_tensor_log_cuda
- func: _foreach_log_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log_slow_
+ CompositeExplicitAutograd: foreach_tensor_log_slow_
CUDA: foreach_tensor_log_cuda_
autogen: _foreach_log.out
- func: _foreach_log10(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log10_slow
+ CompositeExplicitAutograd: foreach_tensor_log10_slow
CUDA: foreach_tensor_log10_cuda
- func: _foreach_log10_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log10_slow_
+ CompositeExplicitAutograd: foreach_tensor_log10_slow_
CUDA: foreach_tensor_log10_cuda_
autogen: _foreach_log10.out
- func: _foreach_log1p(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log1p_slow
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow
CUDA: foreach_tensor_log1p_cuda
- func: _foreach_log1p_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log1p_slow_
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow_
CUDA: foreach_tensor_log1p_cuda_
autogen: _foreach_log1p.out
- func: _foreach_log2(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log2_slow
+ CompositeExplicitAutograd: foreach_tensor_log2_slow
CUDA: foreach_tensor_log2_cuda
- func: _foreach_log2_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_log2_slow_
+ CompositeExplicitAutograd: foreach_tensor_log2_slow_
CUDA: foreach_tensor_log2_cuda_
autogen: _foreach_log2.out
+- func: _foreach_max(Tensor[] self) -> Tensor[]
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+ variants: function
+ dispatch:
+ CompositeExplicitAutograd: foreach_tensor_max_slow
+ CUDA: foreach_tensor_max_cuda
+ autogen: _foreach_max.out
+
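
Note: _foreach_max is new in this release; assuming it is exposed as torch._foreach_max like the other foreach ops, it returns one 0-dim max per input tensor:

    import torch

    xs = [torch.randn(5), torch.randn(3, 3)]
    maxes = torch._foreach_max(xs)   # [x.max() for x in xs], as a list of 0-dim tensors
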
- func: _foreach_neg(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_neg_slow
+ CompositeExplicitAutograd: foreach_tensor_neg_slow
CUDA: foreach_tensor_neg_cuda
- func: _foreach_neg_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_neg_slow_
+ CompositeExplicitAutograd: foreach_tensor_neg_slow_
CUDA: foreach_tensor_neg_cuda_
autogen: _foreach_neg.out
-- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
+- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_norm_slow
+ CompositeExplicitAutograd: foreach_tensor_norm_slow
CUDA: foreach_tensor_norm_cuda
autogen: _foreach_norm.Scalar_out
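
Note: the Scalar overload of _foreach_norm gains an optional dtype, mirroring the dtype argument of linalg.vector_norm; this is handy for accumulating norms of low-precision grads in fp32. Sketch:

    import torch

    grads = [torch.randn(10, dtype=torch.bfloat16) for _ in range(3)]
    norms = torch._foreach_norm(grads, 2, dtype=torch.float32)   # per-tensor L2 norms in fp32
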
- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_pow_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow
CUDA: foreach_tensor_pow_list_kernel_cuda
- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_pow_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow
CUDA: foreach_tensor_pow_scalar_kernel_cuda
- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_pow_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow
CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_scalar_pow_list_kernel_slow
+ CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow
CUDA: foreach_scalar_pow_list_kernel_cuda
- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
device_check: NoCheck
variants: function
dispatch:
- CPU: foreach_tensor_pow_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_
CUDA: foreach_tensor_pow_list_kernel_cuda_
autogen: _foreach_pow.List_out
- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
device_check: NoCheck
variants: function
dispatch:
- CPU: foreach_tensor_pow_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_
CUDA: foreach_tensor_pow_scalar_kernel_cuda_
autogen: _foreach_pow.Scalar_out
- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
device_check: NoCheck
variants: function
dispatch:
- CPU: foreach_tensor_pow_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow_
CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
autogen: _foreach_pow.ScalarList_out
- func: _foreach_reciprocal(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_reciprocal_slow
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow
CUDA: foreach_tensor_reciprocal_cuda
- func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_reciprocal_slow_
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_
CUDA: foreach_tensor_reciprocal_cuda_
autogen: _foreach_reciprocal.out
- func: _foreach_round(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_round_slow
+ CompositeExplicitAutograd: foreach_tensor_round_slow
CUDA: foreach_tensor_round_cuda
- func: _foreach_round_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_round_slow_
+ CompositeExplicitAutograd: foreach_tensor_round_slow_
CUDA: foreach_tensor_round_cuda_
autogen: _foreach_round.out
- func: _foreach_sigmoid(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sigmoid_slow
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow
CUDA: foreach_tensor_sigmoid_cuda
- func: _foreach_sigmoid_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sigmoid_slow_
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_
CUDA: foreach_tensor_sigmoid_cuda_
autogen: _foreach_sigmoid.out
- func: _foreach_sign(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sign_slow
+ CompositeExplicitAutograd: foreach_tensor_sign_slow
CUDA: foreach_tensor_sign_cuda
- func: _foreach_sign_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sign_slow_
+ CompositeExplicitAutograd: foreach_tensor_sign_slow_
CUDA: foreach_tensor_sign_cuda_
autogen: _foreach_sign.out
- func: _foreach_sin(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sin_slow
+ CompositeExplicitAutograd: foreach_tensor_sin_slow
CUDA: foreach_tensor_sin_cuda
- func: _foreach_sin_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sin_slow_
+ CompositeExplicitAutograd: foreach_tensor_sin_slow_
CUDA: foreach_tensor_sin_cuda_
autogen: _foreach_sin.out
- func: _foreach_sinh(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sinh_slow
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow
CUDA: foreach_tensor_sinh_cuda
- func: _foreach_sinh_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sinh_slow_
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow_
CUDA: foreach_tensor_sinh_cuda_
autogen: _foreach_sinh.out
- func: _foreach_sqrt(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sqrt_slow
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow
CUDA: foreach_tensor_sqrt_cuda
- func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_sqrt_slow_
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow_
CUDA: foreach_tensor_sqrt_cuda_
autogen: _foreach_sqrt.out
- func: _foreach_tan(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tan_slow
+ CompositeExplicitAutograd: foreach_tensor_tan_slow
CUDA: foreach_tensor_tan_cuda
- func: _foreach_tan_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tan_slow_
+ CompositeExplicitAutograd: foreach_tensor_tan_slow_
CUDA: foreach_tensor_tan_cuda_
autogen: _foreach_tan.out
- func: _foreach_tanh(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tanh_slow
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow
CUDA: foreach_tensor_tanh_cuda
- func: _foreach_tanh_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_tanh_slow_
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow_
CUDA: foreach_tensor_tanh_cuda_
autogen: _foreach_tanh.out
- func: _foreach_trunc(Tensor[] self) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_trunc_slow
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow
CUDA: foreach_tensor_trunc_cuda
- func: _foreach_trunc_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_trunc_slow_
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow_
CUDA: foreach_tensor_trunc_cuda_
autogen: _foreach_trunc.out
- func: _foreach_zero_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_zero_slow_
+ CompositeExplicitAutograd: foreach_tensor_zero_slow_
CUDA: foreach_tensor_zero_cuda_
autogen: _foreach_zero, _foreach_zero.out
- func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
- CPU: foreach_tensor_copy_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
CUDA: foreach_tensor_copy_list_kernel_cuda_
- autogen: _foreach_copy, _foreach_copy.out
+ autogen: _foreach_copy.out
+- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CompositeExplicitAutograd: _foreach_copy
+
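`_foreach_copy_` previously autogenerated both its functional and `.out` variants; the functional form is now declared explicitly (returning `self_out`) so functionalization and `torch.compile` can trace the op without a mutation. The public surface is unchanged; a sketch of the in-place entry point, assuming `torch._foreach_copy_`:

```python
import torch

dst = [torch.zeros(4), torch.zeros(2, 3)]
src = [torch.randn(4), torch.randn(2, 3)]

# In-place multi-tensor copy. Under functionalization (e.g. torch.compile),
# this mutation is rewritten into the new functional _foreach_copy op, which
# returns fresh tensors (self_out) instead of writing into dst.
torch._foreach_copy_(dst, src)
assert all(torch.equal(d, s) for d, s in zip(dst, src))
```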
- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
dispatch:
CPU: bucketize_cpu
CUDA: bucketize_cuda
MPS: bucketize_mps
@@ -14560,10 +14642,20 @@
dispatch:
NestedTensorCPU: NestedTensor_to_padded_tensor_generic
NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
autogen: to_padded_tensor.out
+- func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor
+ variants: function
+ dispatch:
+ CUDA: _fbgemm_jagged_to_padded_dense_forward
+
+- func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor
+ variants: function
+ dispatch:
+ CUDA: _fbgemm_dense_to_jagged_forward_symint
+
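The two new `_jagged_to_padded_dense_forward` / `_padded_dense_to_jagged_forward` entries wire FBGEMM's jagged-to-padded conversion kernels into ATen for CUDA; they back the jagged-layout nested-tensor path rather than being called directly. A conceptual sketch using the public nested-tensor API, under the assumption that the jagged-to-padded round trip is supported for this layout in this release (on CUDA it is the conversion these kernels accelerate):

```python
import torch

# Variable-length sequences stored as a jagged-layout nested tensor.
device = "cuda" if torch.cuda.is_available() else "cpu"
seqs = [torch.randn(n, 8, device=device) for n in (2, 5, 3)]
nt = torch.nested.nested_tensor(seqs, layout=torch.jagged)

# Converting to a dense (batch, max_len, 8) tensor is what
# _jagged_to_padded_dense_forward implements on CUDA via FBGEMM;
# _padded_dense_to_jagged_forward covers the reverse direction.
padded = torch.nested.to_padded_tensor(nt, 0.0)
print(padded.shape)  # torch.Size([3, 5, 8])
```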
- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
dispatch:
NestedTensorCPU: NestedTensor_softmax_dropout
NestedTensorCUDA: NestedTensor_softmax_dropout_cuda
tags: nondeterministic_seeded
@@ -14634,35 +14726,40 @@
device_check: NoCheck
dispatch:
CUDA: _scaled_dot_product_efficient_attention_backward_cuda
tags: nondeterministic_seeded
-- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
+- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
dispatch:
CUDA: _scaled_dot_product_cudnn_attention_cuda
tags: nondeterministic_seeded
-- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
+ tags: nondeterministic_seeded
+
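The cuDNN scaled-dot-product-attention entry now returns the same tuple layout as the flash path (cumulative sequence lengths, max lengths, debug mask) and gains an explicit backward op, which is what lets it participate in training. These are internal ops; from user code the backend is chosen through the SDPA context manager. A hedged sketch, assuming `SDPBackend.CUDNN_ATTENTION` is exposed in this release and a CUDA device with a recent cuDNN is available:

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

# Pin SDPA to the cuDNN backend; this lowers to _scaled_dot_product_cudnn_attention,
# and the backward call below reaches the newly declared ..._backward op.
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

out.sum().backward()
```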
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
variants: function
dispatch:
CUDA: _flash_attention_forward
tags: nondeterministic_seeded
-- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
device_check: NoCheck
variants: function
dispatch:
CUDA: _flash_attention_backward
# Returns output, logsumexp if compute_logsumexp
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
variants: function
dispatch:
CUDA: _efficient_attention_forward
tags: nondeterministic_seeded
-- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor)
device_check: NoCheck
variants: function
dispatch:
CUDA: _efficient_attention_backward
@@ -15458,57 +15555,68 @@
- func: _foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor
dispatch:
CPU: foobar
autogen: _foobar.out
-# Fused Optimizer CUDA kernels.
- func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
variants: function
dispatch:
+ CPU: _fused_adam_kernel_cpu_
CUDA: _fused_adam_kernel_cuda_
autogen: _fused_adam, _fused_adam.out
- func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
# but still skip the device check as the Tensor LR can be on CPU
device_check: NoCheck
variants: function
dispatch:
+ CPU: _fused_adam_kernel_cpu_
CUDA: _fused_adam_kernel_cuda_
autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
variants: function
dispatch:
+ CPU: _fused_adamw_kernel_cpu_
CUDA: _fused_adamw_kernel_cuda_
autogen: _fused_adamw, _fused_adamw.out
- func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
# but still skip the device check as the Tensor LR can be on CPU
device_check: NoCheck
variants: function
dispatch:
+ CPU: _fused_adamw_kernel_cpu_
CUDA: _fused_adamw_kernel_cuda_
autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
variants: function
dispatch:
+ CPU: _fused_sgd_kernel_cpu_
CUDA: _fused_sgd_kernel_cuda_
autogen: _fused_sgd, _fused_sgd.out
- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
# Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
# but still skip the device check as the Tensor LR can be on CPU
device_check: NoCheck
variants: function
dispatch:
+ CPU: _fused_sgd_kernel_cpu_
CUDA: _fused_sgd_kernel_cuda_
autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
+
+- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ variants: function
+ dispatch:
+ CPU: _fused_adagrad_kernel_cpu_
+ autogen: _fused_adagrad, _fused_adagrad.out
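The fused Adam/AdamW/SGD entries gain CPU kernels alongside the existing CUDA ones, and a CPU-only fused Adagrad is added, so the fused path is no longer CUDA-only. A minimal sketch, assuming the public optimizers route `fused=True` to these CPU kernels in this release:

```python
import torch

model = torch.nn.Linear(16, 4)  # plain CPU parameters
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, fused=True)

x, y = torch.randn(8, 16), torch.randn(8, 4)
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
opt.step()       # dispatches to _fused_adamw_kernel_cpu_ rather than the foreach path
opt.zero_grad()
```

The new `_fused_adagrad_` entry has only a CPU dispatch for now; presumably it is reached the same way through `torch.optim.Adagrad(..., fused=True)`, though how the public optimizer gates that flag is not shown in this file.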
# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
- func: _propagate_xla_data(Tensor input, Tensor output) -> ()
variants: function