codegen/native_functions.yaml in torch-rb-0.16.0 vs codegen/native_functions.yaml in torch-rb-0.17.0

- old
+ new

@@ -547,24 +547,24 @@ - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA: add_sparse - SparseCsrCPU, SparseCsrCUDA: add_sparse_csr + SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor tags: [core, pointwise] - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA: add_sparse_ - SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ + SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor tags: pointwise - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -573,13 +573,13 @@ structured_inherits: TensorIteratorBase ufunc_inner_loop: Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) ScalarOnly: add (Bool) dispatch: - SparseCPU: add_out_sparse_cpu + SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda - SparseCsrCPU: add_out_sparse_compressed_cpu + SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out MPS: add_out_mps tags: pointwise @@ -1748,10 +1748,11 @@ CompositeImplicitAutograd: conv_transpose3d_symint - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor variants: function dispatch: + Meta: copy_meta CompositeExplicitAutogradNonFunctional: copy tags: core - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) variants: method @@ -3125,10 +3126,11 @@ - func: isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) variants: function structured: True dispatch: CPU, CUDA: isin_Tensor_Tensor_out + MPS: isin_Tensor_Tensor_out_mps - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor variants: function structured_delegate: isin.Tensor_Tensor_out @@ -3266,10 +3268,12 @@ MPS: layer_norm_backward_mps NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested autogen: native_layer_norm_backward.out tags: core +- func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num SparseCPU, SparseCUDA: nan_to_num_sparse @@ -3338,14 +3342,35 @@ - func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int dispatch: CUDA: _cslt_sparse_mm_search +- func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_tile + +- func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_apply + +- func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_apply_dense + +# DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor dispatch: CUDA: _sparse_semi_structured_linear +- func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_mm + +- func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_addmm + - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor dispatch: CUDA: _mixed_dtypes_linear - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor @@ -4082,29 +4107,33 @@ SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out - func: _int_mm(Tensor self, Tensor mat2) -> Tensor dispatch: + CPU: _int_mm_cpu CUDA: _int_mm_cuda - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) dispatch: + CPU: _int_mm_out_cpu CUDA: _int_mm_out_cuda - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor dispatch: CPU: _convert_weight_to_int4pack_cpu CUDA: _convert_weight_to_int4pack_cuda - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor dispatch: CPU: _weight_int4pack_mm_cpu + MPS: _weight_int4pack_mm_mps CUDA: _weight_int4pack_mm_cuda - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor dispatch: CPU: _weight_int8pack_mm_cpu + MPS: _weight_int8pack_mm_mps - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor python_module: sparse - func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor @@ -5395,11 +5424,11 @@ dispatch: CompositeExplicitAutograd: slice_backward autogen: slice_backward.out # NB: This op exists to back the implementation of reverse view_funcs for various views (chunk, -# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification +# slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification # of PT2 graph input subclass instances that are views. This means: # * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it) # * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it) # * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph # input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is @@ -5618,14 +5647,16 @@ SparseCUDA: _sspaddmm_out_cuda - func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor dispatch: CompositeExplicitAutograd: _chunk_cat + CUDA: _chunk_cat_cuda - func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: _chunk_cat_out + CUDA: _chunk_cat_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor dispatch: CompositeExplicitAutograd: stack @@ -5687,12 +5718,12 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: sum - SparseCPU, SparseCUDA: sum_coo - SparseCsrCPU, SparseCsrCUDA: sum_csr + SparseCPU, SparseCUDA, SparseMeta: sum_coo + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr autogen: sum.out - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype structured_delegate: sum.IntList_out @@ -6198,10 +6229,16 @@ - func: _nested_get_jagged_dummy(Tensor any) -> Tensor category_override: dummy dispatch: {} +- func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor) + variants: function + device_check: NoCheck + dispatch: + CPU, CUDA: _nested_compute_contiguous_strides_offsets + - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor dispatch: # calls unsqueeze CompositeExplicitAutogradNonFunctional: _trilinear autogen: _trilinear.out @@ -6463,11 +6500,11 @@ - func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CPU: _efficientzerotensor CUDA: _efficientzerotensor_cuda MPS: _efficientzerotensor_mps - Meta: _efficientzerotensor_meta + Meta: _efficientzerotensor_meta_symint autogen: _efficientzerotensor.out - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CompositeExplicitAutograd: zeros_symint @@ -6540,10 +6577,36 @@ - func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor dispatch: SparseCPU, SparseCUDA: norm_sparse autogen: native_norm.ScalarOpt_dim_dtype_out +- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_with_update_cpu + CUDA: _batch_norm_with_update_cuda + MPS: _batch_norm_with_update_mps + MkldnnCPU: _batch_norm_with_update_mkldnn + autogen: _batch_norm_with_update_functional + +- func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!)) + dispatch: + CPU: _batch_norm_with_update_cpu_out + CUDA: _batch_norm_with_update_cuda_out + MPS: _batch_norm_with_update_mps_out + +- func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _batch_norm_no_update + autogen: _batch_norm_no_update.out + +- func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _new_batch_norm_backward_cpu + CUDA: _new_batch_norm_backward_cuda + MPS: _new_batch_norm_backward_mps + MkldnnCPU: _new_batch_norm_backward_mkldnn + # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor @@ -7040,10 +7103,14 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. +- func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor_with_dims + - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: CompositeExplicitAutograd: sparse_compressed_tensor - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor @@ -7144,13 +7211,13 @@ - func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor - func: sparse_dim(Tensor self) -> int variants: method dispatch: - CPU, CUDA: sparse_dim_strided SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr + CompositeExplicitAutograd: sparse_dim_default device_check: NoCheck device_guard: False # legacy method - func: _dimI(Tensor self) -> int @@ -7161,13 +7228,13 @@ device_guard: False - func: dense_dim(Tensor self) -> int variants: method dispatch: - CPU, CUDA: dense_dim_strided SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False # legacy method - func: _dimV(Tensor self) -> int @@ -7294,11 +7361,11 @@ - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA: copy_sparse_ + SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors - func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] variants: function, method @@ -7397,11 +7464,11 @@ python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight autogen: mkldnn_reorder_conv2d_weight.out -- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv3d_weight autogen: mkldnn_reorder_conv3d_weight.out @@ -7645,11 +7712,11 @@ - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType -- func: can_cast(ScalarType from, ScalarType to) -> bool +- func: can_cast(ScalarType from_, ScalarType to) -> bool variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType variants: function @@ -10220,10 +10287,11 @@ - func: alias(Tensor(a) self) -> Tensor(a) variants: method, function dispatch: CompositeExplicitAutograd: alias + NestedTensorCPU, NestedTensorCUDA: alias_nested tags: core - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () variants: function dispatch: @@ -10253,1047 +10321,1061 @@ - func: _foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ autogen: _foreach_add.Scalar_out - func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ autogen: _foreach_add.List_out - func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow CUDA: foreach_tensor_add_scalarlist_kernel_cuda - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ autogen: _foreach_add.ScalarList_out - func: _foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_tensor_kernel_slow + CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow CUDA: foreach_tensor_add_tensor_kernel_cuda - func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_add_tensor_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_ CUDA: foreach_tensor_add_tensor_kernel_cuda_ autogen: _foreach_add.Tensor_out - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ autogen: _foreach_sub.Scalar_out - func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ autogen: _foreach_sub.List_out - func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ autogen: _foreach_sub.ScalarList_out - func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ autogen: _foreach_mul.Scalar_out - func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ autogen: _foreach_mul.List_out - func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ autogen: _foreach_mul.ScalarList_out - func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_tensor_kernel_slow + CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow CUDA: foreach_tensor_mul_tensor_kernel_cuda - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_mul_tensor_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_ CUDA: foreach_tensor_mul_tensor_kernel_cuda_ autogen: _foreach_mul.Tensor_out - func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ autogen: _foreach_div.Scalar_out - func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ autogen: _foreach_div.List_out - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow CUDA: foreach_tensor_div_scalarlist_kernel_cuda - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ autogen: _foreach_div.ScalarList_out - func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_tensor_kernel_slow + CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow CUDA: foreach_tensor_div_tensor_kernel_cuda - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_div_tensor_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_ CUDA: foreach_tensor_div_tensor_kernel_cuda_ autogen: _foreach_div.Tensor_out - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ autogen: _foreach_clamp_max.Scalar_out - func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow CUDA: foreach_tensor_clamp_max_list_kernel_cuda - func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ autogen: _foreach_clamp_max.List_out - func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ autogen: _foreach_clamp_max.ScalarList_out - func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ autogen: _foreach_clamp_min.Scalar_out - func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow CUDA: foreach_tensor_clamp_min_list_kernel_cuda - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ autogen: _foreach_clamp_min.List_out - func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ autogen: _foreach_clamp_min.ScalarList_out # foreach_minimum/maximum dispatches to clamp_max/min - func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ autogen: _foreach_maximum.Scalar_out # foreach_minimum/maximum dispatches to clamp_max/min - func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow CUDA: foreach_tensor_clamp_min_list_kernel_cuda - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ autogen: _foreach_maximum.List_out # foreach_minimum/maximum dispatches to clamp_max/min - func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ autogen: _foreach_maximum.ScalarList_out - func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ autogen: _foreach_minimum.Scalar_out - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow CUDA: foreach_tensor_clamp_max_list_kernel_cuda - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ autogen: _foreach_minimum.List_out - func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda - func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ autogen: _foreach_minimum.ScalarList_out - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalar_slow + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow CUDA: foreach_tensor_addcdiv_scalar_cuda - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalarlist_slow + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow CUDA: foreach_tensor_addcdiv_scalarlist_cuda - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_tensor_slow + CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow CUDA: foreach_tensor_addcdiv_tensor_cuda - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalar_slow_ + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ autogen: _foreach_addcdiv.Scalar_out - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_scalarlist_slow_ + CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ autogen: _foreach_addcdiv.ScalarList_out - func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcdiv_tensor_slow_ + CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_ CUDA: foreach_tensor_addcdiv_tensor_cuda_ autogen: _foreach_addcdiv.Tensor_out - func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalar_slow + CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow CUDA: foreach_tensor_addcmul_scalar_cuda - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalarlist_slow + CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow CUDA: foreach_tensor_addcmul_scalarlist_cuda - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_tensor_slow + CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow CUDA: foreach_tensor_addcmul_tensor_cuda - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalar_slow_ + CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ autogen: _foreach_addcmul.Scalar_out - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_scalarlist_slow_ + CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ autogen: _foreach_addcmul.ScalarList_out - func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_addcmul_tensor_slow_ + CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_ CUDA: foreach_tensor_addcmul_tensor_cuda_ autogen: _foreach_addcmul.Tensor_out - func: _foreach_abs(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_abs_slow + CompositeExplicitAutograd: foreach_tensor_abs_slow CUDA: foreach_tensor_abs_cuda - func: _foreach_abs_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_abs_slow_ + CompositeExplicitAutograd: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ autogen: _foreach_abs.out - func: _foreach_acos(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_acos_slow + CompositeExplicitAutograd: foreach_tensor_acos_slow CUDA: foreach_tensor_acos_cuda - func: _foreach_acos_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_acos_slow_ + CompositeExplicitAutograd: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ autogen: _foreach_acos.out - func: _foreach_asin(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_asin_slow + CompositeExplicitAutograd: foreach_tensor_asin_slow CUDA: foreach_tensor_asin_cuda - func: _foreach_asin_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_asin_slow_ + CompositeExplicitAutograd: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ autogen: _foreach_asin.out - func: _foreach_atan(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_atan_slow + CompositeExplicitAutograd: foreach_tensor_atan_slow CUDA: foreach_tensor_atan_cuda - func: _foreach_atan_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_atan_slow_ + CompositeExplicitAutograd: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ autogen: _foreach_atan.out - func: _foreach_ceil(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_ceil_slow + CompositeExplicitAutograd: foreach_tensor_ceil_slow CUDA: foreach_tensor_ceil_cuda - func: _foreach_ceil_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_ceil_slow_ + CompositeExplicitAutograd: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ autogen: _foreach_ceil.out - func: _foreach_cos(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_cos_slow + CompositeExplicitAutograd: foreach_tensor_cos_slow CUDA: foreach_tensor_cos_cuda - func: _foreach_cos_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_cos_slow_ + CompositeExplicitAutograd: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ autogen: _foreach_cos.out - func: _foreach_cosh(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_cosh_slow + CompositeExplicitAutograd: foreach_tensor_cosh_slow CUDA: foreach_tensor_cosh_cuda - func: _foreach_cosh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_cosh_slow_ + CompositeExplicitAutograd: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ autogen: _foreach_cosh.out - func: _foreach_erf(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_erf_slow + CompositeExplicitAutograd: foreach_tensor_erf_slow CUDA: foreach_tensor_erf_cuda - func: _foreach_erf_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_erf_slow_ + CompositeExplicitAutograd: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ autogen: _foreach_erf.out - func: _foreach_erfc(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_erfc_slow + CompositeExplicitAutograd: foreach_tensor_erfc_slow CUDA: foreach_tensor_erfc_cuda - func: _foreach_erfc_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_erfc_slow_ + CompositeExplicitAutograd: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ autogen: _foreach_erfc.out - func: _foreach_exp(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_exp_slow + CompositeExplicitAutograd: foreach_tensor_exp_slow CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_exp_slow_ + CompositeExplicitAutograd: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ autogen: _foreach_exp.out - func: _foreach_expm1(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_expm1_slow + CompositeExplicitAutograd: foreach_tensor_expm1_slow CUDA: foreach_tensor_expm1_cuda - func: _foreach_expm1_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_expm1_slow_ + CompositeExplicitAutograd: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ autogen: _foreach_expm1.out - func: _foreach_floor(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_floor_slow + CompositeExplicitAutograd: foreach_tensor_floor_slow CUDA: foreach_tensor_floor_cuda - func: _foreach_floor_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_floor_slow_ + CompositeExplicitAutograd: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ autogen: _foreach_floor.out - func: _foreach_frac(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_frac_slow + CompositeExplicitAutograd: foreach_tensor_frac_slow CUDA: foreach_tensor_frac_cuda - func: _foreach_frac_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_frac_slow_ + CompositeExplicitAutograd: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ autogen: _foreach_frac.out - func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices variants: function dispatch: - CPU: foreach_tensor_ternary_lerp_slow + CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow CUDA: foreach_tensor_lerp_ternary_cuda autogen: _foreach_lerp.List_out - func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices variants: function dispatch: - CPU: foreach_tensor_ternary_lerp_slow_ + CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_ CUDA: foreach_tensor_lerp_ternary_cuda_ autogen: _foreach_lerp.List_out - func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices variants: function dispatch: - CPU: foreach_tensor_lerp_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow CUDA: foreach_tensor_lerp_list_cuda autogen: _foreach_lerp.Scalar_out - func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices variants: function dispatch: - CPU: foreach_tensor_lerp_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_ CUDA: foreach_tensor_lerp_list_cuda_ autogen: _foreach_lerp.Scalar_out - func: _foreach_lgamma(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_lgamma_slow + CompositeExplicitAutograd: foreach_tensor_lgamma_slow CUDA: foreach_tensor_lgamma_cuda - func: _foreach_lgamma_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_lgamma_slow_ + CompositeExplicitAutograd: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ autogen: _foreach_lgamma.out - func: _foreach_log(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log_slow + CompositeExplicitAutograd: foreach_tensor_log_slow CUDA: foreach_tensor_log_cuda - func: _foreach_log_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log_slow_ + CompositeExplicitAutograd: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ autogen: _foreach_log.out - func: _foreach_log10(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log10_slow + CompositeExplicitAutograd: foreach_tensor_log10_slow CUDA: foreach_tensor_log10_cuda - func: _foreach_log10_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log10_slow_ + CompositeExplicitAutograd: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ autogen: _foreach_log10.out - func: _foreach_log1p(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log1p_slow + CompositeExplicitAutograd: foreach_tensor_log1p_slow CUDA: foreach_tensor_log1p_cuda - func: _foreach_log1p_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log1p_slow_ + CompositeExplicitAutograd: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ autogen: _foreach_log1p.out - func: _foreach_log2(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log2_slow + CompositeExplicitAutograd: foreach_tensor_log2_slow CUDA: foreach_tensor_log2_cuda - func: _foreach_log2_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_log2_slow_ + CompositeExplicitAutograd: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ autogen: _foreach_log2.out +- func: _foreach_max(Tensor[] self) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CompositeExplicitAutograd: foreach_tensor_max_slow + CUDA: foreach_tensor_max_cuda + autogen: _foreach_max.out + - func: _foreach_neg(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_neg_slow + CompositeExplicitAutograd: foreach_tensor_neg_slow CUDA: foreach_tensor_neg_cuda - func: _foreach_neg_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_neg_slow_ + CompositeExplicitAutograd: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ autogen: _foreach_neg.out -- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[] +- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_norm_slow + CompositeExplicitAutograd: foreach_tensor_norm_slow CUDA: foreach_tensor_norm_cuda autogen: _foreach_norm.Scalar_out - func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_pow_list_kernel_slow + CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow CUDA: foreach_tensor_pow_list_kernel_cuda - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_pow_scalar_kernel_slow + CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow CUDA: foreach_tensor_pow_scalar_kernel_cuda - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_pow_scalarlist_kernel_slow + CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow CUDA: foreach_tensor_pow_scalarlist_kernel_cuda - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_scalar_pow_list_kernel_slow + CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow CUDA: foreach_scalar_pow_list_kernel_cuda - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> () device_check: NoCheck variants: function dispatch: - CPU: foreach_tensor_pow_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_ CUDA: foreach_tensor_pow_list_kernel_cuda_ autogen: _foreach_pow.List_out - func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> () device_check: NoCheck variants: function dispatch: - CPU: foreach_tensor_pow_scalar_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_ CUDA: foreach_tensor_pow_scalar_kernel_cuda_ autogen: _foreach_pow.Scalar_out - func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> () device_check: NoCheck variants: function dispatch: - CPU: foreach_tensor_pow_scalarlist_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow_ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_ autogen: _foreach_pow.ScalarList_out - func: _foreach_reciprocal(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_reciprocal_slow + CompositeExplicitAutograd: foreach_tensor_reciprocal_slow CUDA: foreach_tensor_reciprocal_cuda - func: _foreach_reciprocal_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_reciprocal_slow_ + CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ autogen: _foreach_reciprocal.out - func: _foreach_round(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_round_slow + CompositeExplicitAutograd: foreach_tensor_round_slow CUDA: foreach_tensor_round_cuda - func: _foreach_round_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_round_slow_ + CompositeExplicitAutograd: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ autogen: _foreach_round.out - func: _foreach_sigmoid(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sigmoid_slow + CompositeExplicitAutograd: foreach_tensor_sigmoid_slow CUDA: foreach_tensor_sigmoid_cuda - func: _foreach_sigmoid_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sigmoid_slow_ + CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ autogen: _foreach_sigmoid.out - func: _foreach_sign(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sign_slow + CompositeExplicitAutograd: foreach_tensor_sign_slow CUDA: foreach_tensor_sign_cuda - func: _foreach_sign_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sign_slow_ + CompositeExplicitAutograd: foreach_tensor_sign_slow_ CUDA: foreach_tensor_sign_cuda_ autogen: _foreach_sign.out - func: _foreach_sin(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sin_slow + CompositeExplicitAutograd: foreach_tensor_sin_slow CUDA: foreach_tensor_sin_cuda - func: _foreach_sin_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sin_slow_ + CompositeExplicitAutograd: foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ autogen: _foreach_sin.out - func: _foreach_sinh(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sinh_slow + CompositeExplicitAutograd: foreach_tensor_sinh_slow CUDA: foreach_tensor_sinh_cuda - func: _foreach_sinh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sinh_slow_ + CompositeExplicitAutograd: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ autogen: _foreach_sinh.out - func: _foreach_sqrt(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sqrt_slow + CompositeExplicitAutograd: foreach_tensor_sqrt_slow CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_sqrt_slow_ + CompositeExplicitAutograd: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ autogen: _foreach_sqrt.out - func: _foreach_tan(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tan_slow + CompositeExplicitAutograd: foreach_tensor_tan_slow CUDA: foreach_tensor_tan_cuda - func: _foreach_tan_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tan_slow_ + CompositeExplicitAutograd: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ autogen: _foreach_tan.out - func: _foreach_tanh(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tanh_slow + CompositeExplicitAutograd: foreach_tensor_tanh_slow CUDA: foreach_tensor_tanh_cuda - func: _foreach_tanh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_tanh_slow_ + CompositeExplicitAutograd: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ autogen: _foreach_tanh.out - func: _foreach_trunc(Tensor[] self) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_trunc_slow + CompositeExplicitAutograd: foreach_tensor_trunc_slow CUDA: foreach_tensor_trunc_cuda - func: _foreach_trunc_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_trunc_slow_ + CompositeExplicitAutograd: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ autogen: _foreach_trunc.out - func: _foreach_zero_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_zero_slow_ + CompositeExplicitAutograd: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ autogen: _foreach_zero, _foreach_zero.out - func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: - CPU: foreach_tensor_copy_list_kernel_slow_ + CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_ CUDA: foreach_tensor_copy_list_kernel_cuda_ - autogen: _foreach_copy, _foreach_copy.out + autogen: _foreach_copy.out +- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out + device_check: NoCheck + variants: function + dispatch: + CompositeExplicitAutograd: _foreach_copy + - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda MPS: bucketize_mps @@ -14560,10 +14642,20 @@ dispatch: NestedTensorCPU: NestedTensor_to_padded_tensor_generic NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda autogen: to_padded_tensor.out +- func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor + variants: function + dispatch: + CUDA: _fbgemm_jagged_to_padded_dense_forward + +- func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor + variants: function + dispatch: + CUDA: _fbgemm_dense_to_jagged_forward_symint + - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor dispatch: NestedTensorCPU: NestedTensor_softmax_dropout NestedTensorCUDA: NestedTensor_softmax_dropout_cuda tags: nondeterministic_seeded @@ -14634,35 +14726,40 @@ device_check: NoCheck dispatch: CUDA: _scaled_dot_product_efficient_attention_backward_cuda tags: nondeterministic_seeded -- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset) +- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) dispatch: CUDA: _scaled_dot_product_cudnn_attention_cuda tags: nondeterministic_seeded -- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) +- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_backward_cuda + tags: nondeterministic_seeded + +- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) variants: function dispatch: CUDA: _flash_attention_forward tags: nondeterministic_seeded -- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor) +- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor) device_check: NoCheck variants: function dispatch: CUDA: _flash_attention_backward # Returns output, logsumexp if compute_logsumexp -- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) +- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) variants: function dispatch: CUDA: _efficient_attention_forward tags: nondeterministic_seeded -- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor) +- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor) device_check: NoCheck variants: function dispatch: CUDA: _efficient_attention_backward @@ -15458,57 +15555,68 @@ - func: _foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor dispatch: CPU: foobar autogen: _foobar.out -# Fused Optimizer CUDA kernels. - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_adam_kernel_cpu_ CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam, _fused_adam.out - func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now), # but still skip the device check as the Tensor LR can be on CPU device_check: NoCheck variants: function dispatch: + CPU: _fused_adam_kernel_cpu_ CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out - func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_adamw_kernel_cpu_ CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw, _fused_adamw.out - func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now), # but still skip the device check as the Tensor LR can be on CPU device_check: NoCheck variants: function dispatch: + CPU: _fused_adamw_kernel_cpu_ CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out - func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_sgd_kernel_cpu_ CUDA: _fused_sgd_kernel_cuda_ autogen: _fused_sgd, _fused_sgd.out - func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). # but still skip the device check as the Tensor LR can be on CPU device_check: NoCheck variants: function dispatch: + CPU: _fused_sgd_kernel_cpu_ CUDA: _fused_sgd_kernel_cuda_ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out + +- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + variants: function + dispatch: + CPU: _fused_adagrad_kernel_cpu_ + autogen: _fused_adagrad, _fused_adagrad.out # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. - func: _propagate_xla_data(Tensor input, Tensor output) -> () variants: function