codegen/native_functions.yaml in torch-rb-0.10.2 vs codegen/native_functions.yaml in torch-rb-0.11.0

- old (torch-rb-0.10.2)
+ new (torch-rb-0.11.0)

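Each hunk below diffs entries of PyTorch's operator-registration file, native_functions.yaml: an entry declares an operator schema on a `- func:` line, followed by metadata such as `variants`, `structured`/`structured_delegate`, and a `dispatch` table mapping backend keys to kernel names. As a reading aid, here is the abs.out entry as it stands after this diff, reconstructed from the second hunk with indentation restored (illustrative only; the trailing comment marking the added MPS line is not part of the file):

- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  dispatch:
    CPU, CUDA: abs_out
    MPS: abs_out_mps    # added in 0.11.0
    SparseCPU, SparseCUDA: abs_sparse_out
    SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out

Most of the 0.11.0 changes follow this pattern: new MPS dispatch entries pointing at Apple Metal kernels (*_mps), new NestedTensorCPU/NestedTensorCUDA and SparseCsr dispatch entries, autogen: lines for generated out/functional variants, and a handful of new operators (for example chalf, copy, fill.Scalar/fill.Tensor, stft.center, and the _mps_convolution / _mps_linear / _mps_max_pool2d family).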
@@ -143,10 +143,11 @@ dispatch: CompositeExplicitAutograd: _has_same_storage_numel - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) variants: method + tags: inplace_view - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a) variants: method - func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) @@ -272,10 +273,11 @@ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: abs_out + MPS: abs_out_mps SparseCPU, SparseCUDA: abs_sparse_out SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out # Note [Adding an alias] # To add an alias do the following: @@ -326,16 +328,16 @@ SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr_out - func: view_as_real(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA: view_as_real + CPU, CUDA, MPS, Meta: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA: view_as_complex + CPU, CUDA, Meta: view_as_complex - func: sgn(Tensor self) -> Tensor variants: function, method structured_delegate: sgn.out dispatch: @@ -355,10 +357,13 @@ dispatch: CPU, CUDA: sgn_out SparseCPU, SparseCUDA: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out +- func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + variants: method + - func: real(Tensor(a) self) -> Tensor(a) device_check: NoCheck # TensorIterator variants: function - func: imag(Tensor(a) self) -> Tensor(a) @@ -420,10 +425,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acos_out + MPS: acos_out_mps # arccos, alias of acos - func: arccos(Tensor self) -> Tensor variants: function, method @@ -446,31 +452,36 @@ dispatch: SparseCPU, SparseCUDA: add_sparse SparseCsrCPU, SparseCsrCUDA: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method structured_delegate: add.out dispatch: SparseCPU, SparseCUDA: add_sparse_ SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ MkldnnCPU: mkldnn_add_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase + ufunc_inner_loop: + Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) + ScalarOnly: add (Bool) dispatch: - CPU, CUDA: add_out SparseCPU: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda SparseCsrCPU: add_out_sparse_csr_cpu SparseCsrCUDA: add_out_sparse_csr_cuda MkldnnCPU: mkldnn_add_out + MPS: add_out_mps - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor variants: function dispatch: CPU: add_relu @@ -492,10 +503,11 @@ - func: _add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) variants: function dispatch: CPU: add_relu_ + autogen: _add_relu.Scalar_out # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -505,10 +517,11 @@ - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: add_ + autogen: add.Scalar_out - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor structured_delegate: addmv.out variants: function, method @@ -519,12 +532,13 @@ - func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: addmv_out_cpu CUDA: addmv_out_cuda - SparseCsrCPU: addmv_out_sparse_csr - SparseCsrCUDA: addmv_out_sparse_csr_cuda + MPS: addmv_out_mps + SparseCsrCPU: addmv_out_sparse_compressed + SparseCsrCUDA: addmv_out_sparse_compressed_cuda - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: function, method dispatch: CPU, CUDA: addr @@ -558,10 +572,11 @@ structured: True precomputed: - dim -> int dim dispatch: CPU, CUDA: all_out + MPS: all_out_mps - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -581,10 +596,11 @@ structured: True precomputed: - dim -> int dim dispatch: CPU, CUDA: any_out + MPS: any_out_mps - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -593,18 +609,25 @@ - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +# Note [arange.start_step schema] +# We want `arange.start_step` to be grouped up with `arange.start_out`, +# But this doesn't happen automatically because the step argument +# is defaultable for .start_out but not for .start_step. +# We should probably just make "step" a defaultable param on arange.start, +# and kill arange.start_step. - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) - func: arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: arange_out CUDA: arange_cuda_out + MPS: arange_mps_out # This function is a temporary hack to allow tracing of arange like constructs with dynamic # bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; # if the range you need is based on another tensor, calling this function directly will # preserve tracing. Get rid of this when arange can directly take tensors for bounds @@ -618,10 +641,11 @@ - func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU, CUDA: argmax_out + MPS: argmax_out_mps - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor structured_delegate: argmin.out device_check: NoCheck # TensorIterator variants: function, method @@ -642,10 +666,11 @@ - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acosh_out + MPS: acosh_out_mps # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor variants: function, method @@ -671,10 +696,11 @@ - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asinh_out + MPS: asinh_out_mps SparseCPU, SparseCUDA: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -703,10 +729,11 @@ - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atanh_out + MPS: atanh_out_mps SparseCPU, SparseCUDA: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -719,10 +746,11 @@ - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) variants: function, method dispatch: ZeroTensor, CPU, CUDA, Meta: as_strided_tensorimpl + MPS: as_strided_tensorimpl_mps QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl device_check: NoCheck device_guard: False - func: as_strided_(Tensor(a!) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a!) @@ -754,10 +782,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asin_out + MPS: asin_out_mps SparseCPU, SparseCUDA: asin_sparse_out SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor @@ -788,10 +817,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan_out + MPS: atan_out_mps SparseCPU, SparseCUDA: atan_sparse_out SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -831,10 +861,11 @@ structured: True variants: function dispatch: CPU: baddbmm_out_cpu CUDA: baddbmm_out_cuda + MPS: baddbmm_out_mps SparseCsrCUDA: baddbmm_out_sparse_csr_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -859,23 +890,30 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: CPU, CUDA: bernoulli_out + MPS: bernoulli_out_mps - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ + autogen: bernoulli.Tensor_functional, bernoulli.Tensor_out - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ + autogen: bernoulli.float_out +# Note [bernoulli.p schema] +# We should probably just fix the overload ambiguity by appending a _functional to the C++ API name (BC breaking) # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. - func: bernoulli.p(Tensor self, float p, *, Generator? 
generator=None) -> Tensor device_check: NoCheck # TensorIterator @@ -888,32 +926,36 @@ python_module: nn variants: function dispatch: CPU: binary_cross_entropy_cpu CUDA: binary_cross_entropy_cuda + MPS: binary_cross_entropy_mps - func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn variants: function dispatch: CPU: binary_cross_entropy_out_cpu CUDA: binary_cross_entropy_out_cuda + MPS: binary_cross_entropy_out_mps - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn variants: function dispatch: CPU: binary_cross_entropy_backward_cpu CUDA: binary_cross_entropy_backward_cuda + MPS: binary_cross_entropy_backward_mps - func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn variants: function dispatch: CPU: binary_cross_entropy_backward_out_cpu CUDA: binary_cross_entropy_backward_out_cuda + MPS: binary_cross_entropy_backward_out_mps - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: @@ -1059,10 +1101,11 @@ structured: True variants: function dispatch: CPU: bmm_out_cpu CUDA: bmm_out_cuda + MPS: bmm_out_mps SparseCPU: bmm_out_sparse_cpu SparseCUDA: bmm_out_sparse_cuda SparseCsrCUDA: bmm_out_sparse_csr_cuda - func: broadcast_tensors(Tensor[] tensors) -> Tensor[] @@ -1076,16 +1119,24 @@ variants: function dispatch: SparseCPU, SparseCUDA: sparse_broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor + structured_delegate: cat.out dispatch: - CompositeExplicitAutograd: cat + SparseCPU, SparseCUDA: cat_sparse + QuantizedCPU: cat_quantized_cpu - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + precomputed: + - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format dispatch: - CompositeExplicitAutograd: cat_out + CPU: cat_out_cpu + CUDA: cat_out_cuda + MPS: cat_out_mps + QuantizedCPU: cat_out_quantized_cpu - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) @@ -1123,10 +1174,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: ceil_out + MPS: ceil_out_mps SparseCPU, SparseCUDA: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out # alias for torch.linalg.multi_dot - func: chain_matmul(Tensor[] matrices) -> Tensor @@ -1162,12 +1214,11 @@ dispatch: QuantizedCPU: clamp_quantized_cpu - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor variants: function, method - dispatch: - CPU, CUDA: clamp + structured_delegate: clamp.Tensor_out - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method cpp_no_default_args: ['min'] @@ -1175,87 +1226,96 @@ dispatch: CompositeExplicitAutograd: clamp_ - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!) 
variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_ + structured_delegate: clamp.Tensor_out - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator cpp_no_default_args: ['min'] structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_out + MPS: clamp_out_mps - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_out + CPU, CUDA: clamp_Tensor_out + MPS: clamp_Tensor_out_mps - func: clamp_max(Tensor self, Scalar max) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max + structured_delegate: clamp_max.out - func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max + structured_delegate: clamp_max.Tensor_out - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max_ + structured_delegate: clamp_max.out - func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max_ + structured_delegate: clamp_max.Tensor_out - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_max_out + MPS: clamp_max_out_mps - func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_max_out + CPU, CUDA: clamp_max_Tensor_out + MPS: clamp_max_Tensor_out_mps - func: clamp_min(Tensor self, Scalar min) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min + structured_delegate: clamp_min.out - func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min + structured_delegate: clamp_min.Tensor_out - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min_ + structured_delegate: clamp_min.out - func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min_ + structured_delegate: clamp_min.Tensor_out - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_min_out + MPS: clamp_min_out_mps - func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_min_out + CPU, CUDA: clamp_min_Tensor_out + MPS: clamp_min_Tensor_out_mps # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? 
max=None) -> Tensor cpp_no_default_args: ['min'] variants: function, method @@ -1358,27 +1418,33 @@ - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor + variants: function + - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: MkldnnCPU: copy_mkldnn_ - SparseCPU, SparseCUDA, SparseHIP: copy_sparse_wrapper_ + SparseCPU, SparseCUDA: copy_sparse_wrapper_ CompositeExplicitAutograd: copy_ - SparseCsrCPU, SparseCsrCUDA: copy_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ + autogen: copy.out - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - dispatch: {} + dispatch: + MPS: _copy_from_mps # We need this to be able to properly copy from a CPU to an XLA tensor with different sizes. # See https://github.com/pytorch/xla/issues/2881 - func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor - dispatch: {} + dispatch: + MPS: _copy_from_and_resize_mps - func: cos(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: cos.out @@ -1392,10 +1458,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: cos_out + MPS: cos_out_mps - func: cosh(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: cosh.out @@ -1409,18 +1476,20 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: cosh_out + MPS: cosh_out_mps - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor variants: function, method dispatch: CPU: count_nonzero_cpu CUDA: count_nonzero_cuda + MPS: count_nonzero_mps - func: count_nonzero(Tensor self, int? dim=None) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: count_nonzero @@ -1455,10 +1524,18 @@ - func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution_transpose +- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution_transpose + +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + MPS: mps_convolution_transpose_backward + - func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor dispatch: CUDA: cudnn_convolution_relu - func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor @@ -1677,10 +1754,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: div_out + MPS: div_out_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -1699,10 +1777,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: div_out_mode + MPS: div_out_mode_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor - func: div.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator @@ -1713,20 +1792,22 @@ - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: div_ + autogen: div.Scalar_out - func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: div - func: div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!) variants: method dispatch: CompositeExplicitAutograd: div_ + autogen: div.Scalar_mode_out # divide, alias for div - func: divide.Tensor(Tensor self, Tensor other) -> Tensor variants: function, method @@ -1778,10 +1859,11 @@ - func: dot(Tensor self, Tensor tensor) -> Tensor variants: function, method dispatch: CPU: dot CUDA: dot_cuda + MPS: dot_mps - func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: dot_out @@ -1798,22 +1880,25 @@ - func: einsum(str equation, Tensor[] tensors) -> Tensor - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor dispatch: CompositeExplicitAutograd: embedding + NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda + MPS: embedding_dense_backward_mps - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ + autogen: embedding_renorm.functional, embedding_renorm.out - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor # NOTE [ embedding_bag Native Functions ] # The `_embedding_bag.*` variants assume that input tensors except for `weight`, @@ -1870,14 +1955,16 @@ - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor dispatch: CPU: empty_cpu CUDA: empty_cuda + MPS: empty_mps Meta: empty_meta MkldnnCPU: empty_mkldnn SparseCPU, SparseCUDA: empty_sparse - SparseCsrCPU, SparseCsrCUDA: empty_sparse_csr + SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + QuantizedCPU, QuantizedCUDA: empty_unknown_quantized # We do not make new_empty a composite that calls into new_empty_strided, as the strided version # is significantly more difficult to implement by different backends - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor variants: method @@ -1918,13 +2005,25 @@ device_check: NoCheck device_guard: False dispatch: CPU, Meta: resize_ CUDA: resize_cuda_ + MPS: resize_mps_ QuantizedCPU: quantized_resize_cpu_ SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ + autogen: resize.functional, resize.out +# This is a utility function to enable users to resize out tensor while registering kernels for out variants. +# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration +# to make it easy to register out variants for ops. +- func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function + dispatch: + Meta: _resize_output_ + autogen: _resize_output.functional, _resize_output.out + - func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor category_override: factory variants: function dispatch: QuantizedCPU, QuantizedCUDA: empty_quantized @@ -1936,18 +2035,21 @@ - func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: empty_like + QuantizedCPU, QuantizedCUDA: empty_like_quantized SparseCPU, SparseCUDA: empty_like_sparse_coo SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda + MPS: empty_strided_mps Meta: empty_strided_meta + QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized - func: erf(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: erf.out variants: function, method @@ -1967,10 +2069,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erf_out + MPS: erf_out_mps SparseCPU, SparseCUDA: erf_sparse_out SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out - func: erfc(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2003,10 +2106,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp_out + MPS: exp_out_mps - func: exp2(Tensor self) -> Tensor structured_delegate: exp2.out variants: function, method @@ -2017,10 +2121,11 @@ - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp2_out + MPS: exp2_out_mps - func: expm1(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: expm1.out variants: function, method @@ -2043,10 +2148,17 @@ dispatch: CPU, CUDA: expm1_out SparseCPU, SparseCUDA: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out +- func: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: expand_symint + - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_check: NoCheck device_guard: False dispatch: @@ -2063,15 +2175,17 @@ - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: eye_out_cpu CUDA: eye_out_cuda + MPS: eye_out_mps - func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: eye_out_cpu CUDA: eye_out_cuda + MPS: eye_out_mps - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) variants: function, method - func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) @@ -2087,25 +2201,40 @@ variants: method - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) variants: method +- func: fill.Scalar(Tensor self, Scalar value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + +- func: fill.Tensor(Tensor self, Tensor value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: fill_ + MPS: fill_scalar_mps QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ + SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ + autogen: fill.Scalar_out - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: fill_ + MPS: fill_tensor_mps_ QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ + autogen: fill.Tensor_out - func: floor(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: floor.out variants: function, method @@ -2127,10 +2256,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: floor_out + MPS: floor_out_mps SparseCPU, SparseCUDA: floor_sparse_out SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out - func: floor_divide(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator @@ -2218,14 +2348,16 @@ - func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) structured_delegate: lcm.out variants: function, method # NOTE [ grid_sampler Native Functions ] -# `grid_sampler` does all the shape checking and then dispatches to one of -# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which -# has the corresponding backward defined as native functions as well. Therefore, -# in these functions and their backwards, no more shape checking is done. 
+# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to +# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of +# which has the corresponding backward defined as native functions as well. +# However, we do shape checking everywhere for now since each of the mentioned +# functions can be called directly, which will lead to crashes otherwise. +# See https://github.com/pytorch/pytorch/issues/73187 for more information. # # There is also _grid_sampler_2d_backward_cpu_fallback which is an # implementation detail of grid_sampler_2d and is only exposed here for testing # purposes. # @@ -2259,11 +2391,14 @@ - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda -- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) +# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) dispatch: CPU: grid_sampler_3d_backward_cpu CUDA: grid_sampler_3d_backward_cuda - func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2353,19 +2488,25 @@ # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef<TensorIndex> indices) # - Tensor Tensor::index(std::initializer_list<TensorIndex> indices) +- func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU, CUDA: index_copy_out + - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) variants: method - dispatch: - CompositeExplicitAutograd: index_copy_ + structured_delegate: index_copy.out - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: index_copy + structured_delegate: index_copy.out - func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!) variants: method - func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor @@ -2374,10 +2515,11 @@ - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) 
device_check: NoCheck # delegate to _index_put_impl_, which leverages TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: index_put_ + autogen: index_put.out # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Tensor const & rhs) # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Scalar v) # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Tensor const & rhs) # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Scalar v) @@ -2391,10 +2533,11 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: CPU, CUDA: _index_put_impl_ + autogen: _index_put_impl.functional, _index_put_impl.out - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor variants: function - func: inverse(Tensor self) -> Tensor @@ -2442,11 +2585,11 @@ - func: isnan(Tensor self) -> Tensor variants: function, method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: isnan + CPU, CUDA, MPS: isnan SparseCPU, SparseCUDA: isnan_sparse SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr - func: is_distributed(Tensor self) -> bool variants: function, method @@ -2538,21 +2681,18 @@ - func: native_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_cpu CUDA: layer_norm_cuda + MPS: layer_norm_mps CompositeImplicitAutograd: math_native_layer_norm -- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor - dispatch: - CPU: multi_head_self_attention_cpu - CUDA: multi_head_self_attention_cuda - - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_backward_cpu CUDA: layer_norm_backward_cuda + MPS: layer_norm_backward_mps - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num @@ -2573,10 +2713,18 @@ python_module: nn - func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + MPS: _mps_linear + - func: mkldnn_linear(Tensor self, Tensor weight, Tensor? 
bias=None) -> Tensor python_module: nn dispatch: MkldnnCPU: mkldnn_linear @@ -2590,10 +2738,22 @@ - func: mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: MkldnnCPU: mkldnn_linear_backward +- func: _mps_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor + dispatch: + MPS: _mps_linear_backward_input + +- func: _mps_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) + dispatch: + MPS: _mps_linear_backward_weights + +- func: mps_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_linear_backward + - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) @@ -2620,10 +2780,11 @@ - func: linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: linspace_out CUDA: linspace_cuda_out + MPS: linspace_out_mps - func: log(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log.out variants: function, method @@ -2637,10 +2798,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log_out + MPS: log_out_mps - func: log10(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log10.out variants: function, method @@ -2656,10 +2818,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log10_out + MPS: log10_out_mps - func: log1p(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log1p.out variants: function, method @@ -2679,10 +2842,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log1p_out + MPS: log1p_out_mps SparseCPU, SparseCUDA: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out - func: log2(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2698,16 +2862,18 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log2_out + MPS: log2_out_mps - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logaddexp_out + MPS: logaddexp_out_mps - func: logaddexp(Tensor self, Tensor other) -> Tensor variants: method, function structured_delegate: logaddexp.out dispatch: @@ -2716,10 +2882,11 @@ - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logaddexp2_out + MPS: logaddexp2_out_mps - func: logaddexp2(Tensor self, Tensor other) -> Tensor variants: method, function structured_delegate: logaddexp2.out dispatch: @@ -2789,10 +2956,15 @@ # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor variants: function, method +- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: log_softmax_out + - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor structured_delegate: _log_softmax.out @@ -2800,19 +2972,21 @@ - func: _log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: log_softmax_cpu_out CUDA: log_softmax_cuda_out + MPS: log_softmax_mps_out - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _log_softmax_backward_data.out - func: _log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: log_softmax_backward_cpu_out CUDA: log_softmax_backward_cuda_out + MPS: log_softmax_backward_mps_out - func: _logcumsumexp(Tensor self, int dim) -> Tensor dispatch: CPU: _logcumsumexp_cpu CUDA: _logcumsumexp_cuda @@ -2920,10 +3094,11 @@ structured: True precomputed: - dim -> int dim dispatch: CPU, CUDA: max_out + MPS: max_out_mps - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator variants: function, method @@ -2935,24 +3110,35 @@ device_check: NoCheck device_guard: False - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amax + structured_delegate: amax.out - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True dispatch: CPU, CUDA: amax_out # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: _mps_max_pool2d + +- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: mps_max_pool2d_backward + - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor dispatch: MkldnnCPU: mkldnn_max_pool2d - func: mkldnn_max_pool2d_backward(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor @@ -2972,10 +3158,11 @@ QuantizedCPU: quantized_max_pool1d - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor dispatch: QuantizedCPU: quantized_max_pool2d + QuantizedCUDA: quantized_max_pool2d_cudnn - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ @@ -2995,10 +3182,11 @@ - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: mean_out + MPS: mean_out_mps QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -3067,27 +3255,39 @@ structured: True precomputed: - dim -> int dim dispatch: CPU, CUDA: min_out + MPS: min_out_mps - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator variants: function, method - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) device_check: NoCheck # TensorIterator - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amin + structured_delegate: amin.out - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: CPU, CUDA: amin_out +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution + +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_convolution_backward + - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution - func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) @@ -3128,14 +3328,16 @@ - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: mm_out_cpu CUDA: mm_out_cuda + MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor + python_module: sparse - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor dispatch: SparseCPU: sparse_sparse_matmul_cpu SparseCUDA: sparse_sparse_matmul_cuda @@ -3163,43 +3365,52 @@ device_check: NoCheck # TensorIterator structured_delegate: mul.out variants: function, method dispatch: SparseCPU, SparseCUDA: mul_sparse + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr MkldnnCPU: mkldnn_mul ZeroTensor: mul_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: mul.out variants: method dispatch: SparseCPU, SparseCUDA: mul_sparse_ + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_ MkldnnCPU: mkldnn_mul_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: mul_out + MPS: mul_out_mps SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda + SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr MkldnnCPU: mkldnn_mul_out # For C++ only, until we have conversion from C++ numbers to Tensor - func: mul.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: mul + SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: mul_ + SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr + autogen: mul.Scalar_out # multiply, alias for mul - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor variants: function, method @@ -3244,11 +3455,17 @@ variants: function, method dispatch: CPU: narrow_copy_dense_cpu SparseCPU, SparseCUDA: narrow_copy_sparse CompositeExplicitAutograd: narrow_copy_dense + tags: view_copy +- func: narrow_copy.SymInt(Tensor self, int dim, int start, SymInt length) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: narrow_copy_symint + - func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: narrow_copy_dense_cpu_out - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) @@ -3263,15 +3480,17 @@ - func: native_batch_norm(Tensor input, Tensor? 
weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) dispatch: CPU: batch_norm_cpu CUDA: batch_norm_cuda + MPS: batch_norm_mps MkldnnCPU: mkldnn_batch_norm - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: CUDA: batch_norm_cuda_out + MPS: batch_norm_mps_out - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) dispatch: CUDA: batch_norm_stats_cuda @@ -3294,10 +3513,11 @@ - func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda + MPS: batch_norm_backward_mps MkldnnCPU: mkldnn_batch_norm_backward - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: batch_norm_backward_reduce_cuda @@ -3361,10 +3581,11 @@ - func: permute(Tensor(a) self, int[] dims) -> Tensor(a) variants: function, method dispatch: CompositeExplicitAutograd: permute + MPS: permute_mps - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) variants: function, method - func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) @@ -3401,12 +3622,18 @@ - func: adjoint(Tensor(a) self) -> Tensor(a) variants: function, method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + dispatch: + CPU: pixel_shuffle_cpu + CompositeExplicitAutograd: math_pixel_shuffle - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + dispatch: + CPU: pixel_unshuffle_cpu + CompositeExplicitAutograd: math_pixel_unshuffle - func: channel_shuffle(Tensor self, int groups) -> Tensor dispatch: CPU: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu @@ -3418,10 +3645,11 @@ - func: is_pinned(Tensor self, Device? device=None) -> bool variants: method dispatch: CUDA: is_pinned_cuda + MPS: is_pinned_mps CompositeExplicitAutograd: is_pinned_default # TODO: add a copy kwarg that guarantees that the tensor is put into fresh # pinned memory - func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a) @@ -3429,10 +3657,11 @@ # Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor - func: _pin_memory(Tensor self, Device? 
device=None) -> Tensor dispatch: CUDA: _pin_memory_cuda + MPS: _pin_memory_mps - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor @@ -3564,10 +3793,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: reciprocal_out + MPS: reciprocal_out_mps - func: neg(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: neg.out variants: function, method @@ -3587,10 +3817,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: neg_out + MPS: neg_out_mps SparseCPU, SparseCUDA: neg_out_sparse SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out # Alias for neg - func: negative(Tensor self) -> Tensor @@ -3603,10 +3834,11 @@ - func: repeat(Tensor self, int[] repeats) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. dispatch: CompositeExplicitAutograd: repeat + MPS: repeat_mps - func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor variants: function dispatch: CPU: repeat_interleave_cpu @@ -3629,11 +3861,11 @@ - func: _reshape_alias(Tensor(a) self, int[] size, int[] stride) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor: _reshape_alias + CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias # We don't need to support mkldnn since this is handled explicitly by the reshape operator. - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor device_check: NoCheck device_guard: False @@ -3666,10 +3898,11 @@ structured: True structured_inherits: TensorIteratorBase dispatch: CPU: round_out CUDA: round_out + MPS: round_out_mps SparseCPU, SparseCUDA: round_sparse_out SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out - func: round.decimals(Tensor self, *, int decimals) -> Tensor device_check: NoCheck # TensorIterator @@ -3698,65 +3931,82 @@ - func: relu(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: relu + MPS: relu_mps MkldnnCPU: mkldnn_relu QuantizedCPU: relu_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu - func: relu_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: relu_ + MPS: relu_mps_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: relu_quantized_cpu_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_ + autogen: relu.out - func: relu6(Tensor self) -> Tensor python_module: nn - func: relu6_(Tensor(a!) self) -> Tensor(a!) python_module: nn - func: prelu(Tensor self, Tensor weight) -> Tensor variants: function, method dispatch: + MkldnnCPU: mkldnn_prelu CPU: prelu_cpu CUDA: prelu_cuda - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) variants: function, method dispatch: + MkldnnCPU: mkldnn_prelu_backward CPU: prelu_backward_cpu CUDA: prelu_backward_cuda -- func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU: gelu_out_cpu CUDA: gelu_out_cuda + MPS: gelu_out_mps -- func: gelu(Tensor self) -> Tensor +- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!) structured_delegate: gelu.out device_check: NoCheck # TensorIterator python_module: nn dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ + +- func: gelu(Tensor self, *, str approximate='none') -> Tensor + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: MkldnnCPU: mkldnn_gelu QuantizedCPU: gelu_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu -- func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU: gelu_backward_out_cpu CUDA: gelu_backward_out_cuda + MPS: gelu_backward_out_mps -- func: gelu_backward(Tensor grad, Tensor self) -> Tensor +- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu_backward.grad_input python_module: nn dispatch: MkldnnCPU: mkldnn_gelu_backward @@ -3802,10 +4052,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: rsqrt_out + MPS: rsqrt_out_mps - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False @@ -3814,10 +4065,11 @@ variants: function, method device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: select + SparseCsrCPU, SparseCsrCUDA: select_sparse_csr - func: select_backward(Tensor grad_output, int[] input_sizes, int dim, int index) -> Tensor variants: function device_check: NoCheck device_guard: False @@ -3837,10 +4089,11 @@ - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CompositeExplicitAutograd: celu_ + autogen: celu.out - func: silu(Tensor self) -> Tensor structured_delegate: silu.out python_module: nn dispatch: @@ -3856,17 +4109,19 @@ structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: silu_out + MPS: silu_out_mps - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: silu_backward_out + MPS: silu_backward_out_mps - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor structured_delegate: silu_backward.grad_input python_module: nn dispatch: @@ -3916,10 +4171,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_out + MPS: sigmoid_out_mps - func: logit(Tensor self, float? 
eps=None) -> Tensor variants: function, method dispatch: CPU, CUDA: logit @@ -3953,10 +4209,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sin_out + MPS: sin_out_mps SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out SparseCPU, SparseCUDA: sin_sparse_out - func: sinc(Tensor self) -> Tensor structured_delegate: sinc.out @@ -3992,10 +4249,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sinh_out + MPS: sinh_out_mps SparseCPU, SparseCUDA: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -4078,10 +4336,15 @@ # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method +- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: softmax_out + - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor structured_delegate: _softmax.out @@ -4091,19 +4354,21 @@ - func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: softmax_cpu_out CUDA: softmax_cuda_out + MPS: softmax_mps_out - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _softmax_backward_data.out - func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True dispatch: CPU: softmax_backward_cpu_out CUDA: softmax_backward_cuda_out + MPS: softmax_backward_mps_out - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] variants: function, method device_check: NoCheck device_guard: False @@ -4115,10 +4380,14 @@ device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: split +- func: split.sizes(Tensor(a -> *) self, int[] split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_guard: False + - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] variants: function, method device_check: NoCheck device_guard: False dispatch: @@ -4152,19 +4421,19 @@ - func: squeeze(Tensor(a) self) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: squeeze + CompositeExplicitAutograd: squeeze QuantizedCPU, QuantizedCUDA: squeeze_quantized - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: squeeze + CompositeExplicitAutograd: squeeze QuantizedCPU, QuantizedCUDA: squeeze_quantized - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) variants: function, method device_check: NoCheck @@ -4230,17 +4499,18 @@ - func: dstack(Tensor[] tensors) -> Tensor - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) -# The signature is designed to be consistent with librosa except that it is -# missing the `pad_mode` and `center` arguments, which are taken care of at -# `torch.functional.py`. 
They shall be moved here once we have mapping between -# Python strings and C++ Enum in codegen. +# Overload without center & pad mode, needed for forward-compatibility - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor variants: function, method + cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized'] +- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method + - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor variants: function, method - func: stride.int(Tensor self, int dim) -> int variants: function @@ -4256,10 +4526,11 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: sum + SparseCsrCPU, SparseCsrCUDA: sum_csr - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: sum.IntList_out device_check: NoCheck # TensorIterator variants: function, method @@ -4271,26 +4542,22 @@ - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: sum_out + MPS: sum_out_mps - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator -- func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor +- func: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method dispatch: CPU, CUDA: nansum -- func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - variants: function, method +- func: nansum.out(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU, CUDA: nansum - -- func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - dispatch: CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor variants: method device_check: NoCheck @@ -4316,10 +4583,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sqrt_out + MPS: sqrt_out_mps SparseCPU, SparseCUDA: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out - func: square(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -4328,12 +4596,10 @@ - func: square_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: square_out - func: std(Tensor self, bool unbiased=True) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -4344,10 +4610,11 @@ - func: std.correction(Tensor self, int[1]? dim, *, int? 
correction, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: std + MPS: std_mps - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator variants: function @@ -4395,10 +4662,11 @@ - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: prod + MPS: prod_mps - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: prod.int_out device_check: NoCheck # TensorIterator variants: function, method @@ -4406,10 +4674,11 @@ - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: prod_out + MPS: prod_out_mps - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -4451,10 +4720,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tan_out + MPS: tan_out_mps SparseCPU, SparseCUDA: tan_sparse_out SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out - func: tanh(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -4479,10 +4749,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_out + MPS: tanh_out_mps SparseCPU, SparseCUDA: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function @@ -4509,16 +4780,18 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_out + MPS: threshold_out_mps - func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_backward_out + MPS: threshold_backward_out_mps - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor variants: function structured_delegate: threshold_backward.grad_input dispatch: @@ -4556,10 +4829,11 @@ - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) device_check: NoCheck device_guard: False dispatch: MkldnnCPU: mkldnn_transpose_ + autogen: _mkldnn_transpose.out - func: one_hot(Tensor self, int num_classes=-1) -> Tensor python_module: nn variants: function @@ -4593,10 +4867,32 @@ - func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor +# Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads). 
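# The following is an illustrative sketch (not part of the generated schema), assuming qkv is
# packed as [B, T, 3*D] and qkv_bias as [3*D]; it restates the comment above in Python and is
# not the kernel implementation:
#
#   import math
#   q, k, v = (qkv + qkv_bias).chunk(3, dim=-1)   # add in-projection bias, split packed QKV
#   q = q / math.sqrt(q.size(-1) // num_heads)    # divide Q by sqrt(D / num_heads)
#   # q, k, v are then laid out per attention head, roughly [B, num_heads, T, D // num_heads]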
+- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu + CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda + +- func: _nested_tensor_from_mask(Tensor t, Tensor mask) -> Tensor + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask + +- func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor + device_check: NoCheck # cpu_nested_shape_example will always be on CPU + dispatch: + CPU: nested_from_padded_generic + CUDA: nested_from_padded_cuda + +# _nested_from_padded is not usable from Python, so +# _nested_from_padded_and_nested_example is available for testing. +- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example + - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor dispatch: CompositeExplicitAutograd: _trilinear - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor @@ -4623,10 +4919,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: trunc_out + MPS: trunc_out_mps SparseCPU, SparseCUDA: trunc_sparse_out SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out # Alias for trunc - func: fix(Tensor self) -> Tensor @@ -4684,11 +4981,11 @@ - func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: unsqueeze + CompositeExplicitAutograd: unsqueeze SparseCPU, SparseCUDA: unsqueeze_sparse QuantizedCPU, QuantizedCUDA: unsqueeze_quantized - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) variants: method @@ -4711,10 +5008,11 @@ - func: var.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: var + MPS: var_mps - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator - func: var.correction_out(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) @@ -4762,17 +5060,23 @@ - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) variants: method device_check: NoCheck device_guard: False -# we define both of these because 'where' does the broadcast and '_s_where' doesn't; -# this allows us to implicitly calculate the broadcast derivative, while only dealing with the -# _s_where derivative. - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method + dispatch: + CPU, CUDA: where + MPS: where_mps +- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: where_self_out + MPS: where_self_out_mps + - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor variants: function - func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor variants: function @@ -4782,32 +5086,29 @@ - func: where(Tensor condition) -> Tensor[] device_check: NoCheck # TensorIterator variants: function -- func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - variants: function - dispatch: - CPU, CUDA: _s_where - - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function # VariableType::_weight_norm does not want to be given a gap in the autograd graph, # so we don't define "dispatch" variants for it. - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor variants: function -- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) +- func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) variants: function dispatch: + CPU: weight_norm_cpu CUDA: weight_norm_cuda -- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) +- func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function dispatch: - CUDA: weight_norm_cuda_backward + CPU: weight_norm_backward_cpu + CUDA: weight_norm_backward_cuda - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -4885,10 +5186,20 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor dispatch: SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda +- func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_sum_cpu + SparseCsrCUDA: _sparse_csr_sum_cuda + +- func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_prod_cpu + SparseCsrCUDA: _sparse_csr_prod_cuda + - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor python_module: sparse variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -4960,10 +5271,11 @@ - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: norm_out + MPS: norm_out_mps # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -4985,37 +5297,44 @@ - func: frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) 
exponent) dispatch: CPU, CUDA: frexp_out +# Deprecated (v.1.12) - func: frobenius_norm(Tensor self) -> Tensor variants: function +# Deprecated (v.1.12) - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function +# Deprecated (v.1.12) - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function +# Deprecated (v.1.12) - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: clone SparseCPU, SparseCUDA: clone_sparse - SparseCsrCPU, SparseCsrCUDA: clone_sparse_csr + SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone - func: positive(Tensor(a) self) -> Tensor(a) variants: function, method @@ -5023,41 +5342,48 @@ - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: function, method dispatch: CompositeExplicitAutograd: resize_as_ + autogen: resize_as.functional, resize_as.out - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) use_const_ref_for_mutable_tensors: True - variants: function + variants: function, method dispatch: SparseCPU, SparseCUDA: resize_as_sparse_ SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_csr_ + autogen: resize_as_sparse.functional, resize_as_sparse.out - func: zero_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method, function dispatch: CPU, CUDA: zero_ + MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA: zero_sparse_ + SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ + autogen: zero.functional, zero.out - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sub_out + MPS: sub_out_mps SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: sub.out dispatch: SparseCPU, SparseCUDA: sub_sparse + ZeroTensor: sub_zerotensor - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method structured_delegate: sub.out @@ -5074,10 +5400,11 @@ - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: sub_ + autogen: sub.Scalar_out # subtract, alias for sub - func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
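# For reference, sub/subtract compute self - alpha * other (subtract is a plain alias of sub);
# a minimal illustrative example, assuming a recent torch build:
#
#   import torch
#   torch.subtract(torch.tensor([3., 4.]), torch.tensor([1., 2.]), alpha=2)
#   # => tensor([1., 0.])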
- func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor @@ -5123,52 +5450,65 @@ dispatch: CompositeExplicitAutograd: rsub # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. -- func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor +- func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor python_module: sparse dispatch: CompositeExplicitAutograd: _sparse_addmm - func: sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) python_module: sparse dispatch: SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu - func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor python_module: sparse dispatch: SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu - func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: addmm_out_cpu CUDA: addmm_out_cuda + MPS: addmm_out_mps SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda - SparseCsrCPU: addmm_out_sparse_csr_cpu - SparseCsrCUDA: addmm_out_sparse_csr_cuda + SparseCsrCPU: addmm_out_sparse_compressed_cpu + SparseCsrCUDA: addmm_out_sparse_compressed_cuda - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor structured_delegate: addmm.out variants: function, method dispatch: SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda - SparseCsrCPU, SparseCsrCUDA: addmm_sparse_csr_dense + SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) structured_delegate: addmm.out variants: method dispatch: # Warning! For whatever reason, the inplace sparse addmm is NON # broadcasting SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ +- func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmm_activation_out_cpu + CUDA: addmm_activation_out_cuda + +- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor + structured_delegate: _addmm_activation.out + variants: function, method + # NOTE [ Sparse: autograd and API ] # # # Sparse Tensor Constructors # ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -5276,15 +5616,27 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. +- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -5292,11 +5644,15 @@ - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () +- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> () - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: SparseCPU, SparseCUDA: new_with_dims_sparse @@ -5307,30 +5663,38 @@ - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_ + autogen: sparse_resize.functional, sparse_resize.out - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_and_clear_ + autogen: sparse_resize_and_clear.functional, sparse_resize_and_clear.out - func: sparse_mask(Tensor self, Tensor mask) -> Tensor variants: method dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda + SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr - func: _to_cpu(Tensor[] tensors) -> Tensor[] variants: function - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor variants: method + +# Special case of to_dense with custom derivative +- func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor + variants: method dispatch: - SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: sparse_to_dense + SparseCPU, SparseCUDA: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor - func: sparse_dim(Tensor self) -> int @@ -5412,10 +5776,11 @@ variants: method dispatch: SparseCPU, SparseCUDA: _coalesced_sparse_ device_check: NoCheck device_guard: False + autogen: _coalesced.functional, _coalesced.out - func: indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: SparseCPU, SparseCUDA: indices_sparse @@ -5442,10 +5807,24 @@ dispatch: SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr device_check: NoCheck device_guard: False +- func: ccol_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr + device_check: NoCheck + device_guard: False + +- func: row_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr + device_check: NoCheck + device_guard: False + - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) dispatch: SparseCPU: hspmm_out_sparse_cpu SparseCUDA: hspmm_out_sparse_cuda @@ -5457,29 +5836,61 @@ - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
device_check: NoCheck # Allows copy into different device variants: function dispatch: SparseCPU, SparseCUDA: copy_sparse_ + autogen: copy_sparse_to_sparse.functional, copy_sparse_to_sparse.out - func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] variants: function, method dispatch: CompositeExplicitAutograd: unbind + NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse - func: to_sparse(Tensor self) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse +- func: to_sparse_csr(Tensor self) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csr + SparseCPU, SparseCUDA: coo_to_sparse_csr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr + +- func: to_sparse_csc(Tensor self) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csc + SparseCPU, SparseCUDA: coo_to_sparse_csc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc + +- func: to_sparse_bsr(Tensor self, int[2] blocksize) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsr + SparseCPU, SparseCUDA: coo_to_sparse_bsr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr + +- func: to_sparse_bsc(Tensor self, int[2] blocksize) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsc + SparseCPU, SparseCUDA: coo_to_sparse_bsc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc + - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor variants: method dispatch: CPU: dense_to_mkldnn @@ -5634,12 +6045,12 @@ - func: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) dispatch: CPU: fused_moving_avg_obs_fake_quant_cpu CUDA: fused_moving_avg_obs_fake_quant_cuda + autogen: _fused_moving_avg_obs_fq_helper.functional, _fused_moving_avg_obs_fq_helper.out - - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) variants: function - func: _saturate_weight_to_fp16(Tensor weight) -> Tensor variants: function @@ -5720,21 +6131,38 @@ # NB: Does NOT check precondition that numel == 1 - func: _local_scalar_dense(Tensor self) -> Scalar dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda + MPS: _local_scalar_dense_mps variants: function +# MPS LSTM implementation + +- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + MPS: _lstm_mps + +- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + dispatch: + MPS: lstm_mps_backward + + # Fused RNN kernels - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? 
hidden_bias=None) -> (Tensor, Tensor, Tensor) dispatch: CUDA: _thnn_fused_lstm_cell_cuda -- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) dispatch: - CUDA: _thnn_fused_lstm_cell_backward_cuda + CUDA: _thnn_fused_lstm_cell_backward_impl_cuda +- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor) dispatch: CUDA: _thnn_fused_gru_cell_cuda @@ -5810,47 +6238,68 @@ - func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: set_ + CPU, CUDA, Meta, MPS: set_ + autogen: set.source_Storage_functional, set.source_Storage_out - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU: set_storage_cpu_ + CPU, Meta: set_storage_cpu_ CUDA: set_storage_cuda_ + MPS: set_storage_mps_ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ + autogen: set.source_Storage_storage_offset_functional, set.source_Storage_storage_offset_out +- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: set_tensor_ + CPU, CUDA, Meta, MPS: set_tensor_ + autogen: set.source_Tensor_functional, set.source_Tensor_out - func: set_(Tensor(a!) self) -> Tensor(a!) variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ + Meta: set_meta_ + MPS: set_mps_ + autogen: set.functional, set.out +- func: lift(Tensor self) -> Tensor + variants: method + dispatch: + # Not making it CompositeImplicitAutograd because lift + # should be a primitive w.r.t. functorch + CompositeExplicitAutograd: lift + - func: is_set_to(Tensor self, Tensor tensor) -> bool variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: is_set_to + CPU, CUDA, MPS: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
device_check: NoCheck # TensorIterator variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + MPS: masked_fill__mps + autogen: masked_fill.Scalar_out - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: @@ -5860,10 +6309,12 @@ device_check: NoCheck # TensorIterator variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + MPS: masked_fill__mps + autogen: masked_fill.Tensor_out - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: @@ -5872,27 +6323,33 @@ - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) variants: method dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda + autogen: masked_scatter.out - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: masked_scatter -- func: _masked_softmax(Tensor self, Tensor mask) -> Tensor +- func: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor dispatch: CUDA: masked_softmax_cuda CPU: masked_softmax_cpu +- func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor + dispatch: + CUDA: masked_softmax_backward_cuda + CPU: masked_softmax_backward_cpu + - func: view(Tensor(a) self, int[] size) -> Tensor(a) variants: method device_check: NoCheck device_guard: False dispatch: - ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA: view + ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, MPS: view MkldnnCPU: mkldnn_view # Warning: If you want to change the name or overload name of this # operator, you might also want to change the `isBlockListedSchema` # function in `torch/csrc/jit/frontend/schema_catching.cpp`. @@ -5907,11 +6364,12 @@ CompositeExplicitAutograd: view_dtype - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) variants: method dispatch: - CPU, CUDA: put_ + CPU, CUDA, MPS: put_ + autogen: put.out - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor variants: function, method - func: index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -5932,16 +6390,34 @@ variants: function, method - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor variants: function, method +- func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_reduce_cpu_out + CUDA: index_reduce_cuda_out + +- func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: index_reduce.out + variants: method + +- func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: index_reduce.out + variants: function, method + - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method dispatch: CPU: index_fill_ CUDA: index_fill_ + autogen: index_fill.int_Scalar_out - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: @@ -5950,10 +6426,11 @@ - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: index_fill_ + autogen: index_fill.int_Tensor_out - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: @@ -5986,10 +6463,11 @@ - func: scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) structured: True variants: function dispatch: CPU, CUDA: scatter_src_out + MPS: scatter_src_out_mps - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor structured_delegate: scatter.value_out variants: function, method @@ -6000,10 +6478,11 @@ - func: scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!) structured: True variants: function dispatch: CPU, CUDA: scatter_value_out + MPS: scatter_value_out_mps - func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor structured_delegate: scatter.reduce_out variants: function, method @@ -6014,10 +6493,11 @@ - func: scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!) structured: True variants: function dispatch: CPU, CUDA: scatter_reduce_out + MPS: scatter_reduce_out_mps - func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor structured_delegate: scatter.value_reduce_out variants: function, method @@ -6028,10 +6508,11 @@ - func: scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!) structured: True variants: function dispatch: CPU, CUDA: scatter_value_reduce_out + MPS: scatter_value_reduce_out_mps - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor @@ -6048,18 +6529,28 @@ - func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) structured: True variants: function dispatch: CPU, CUDA: scatter_add + MPS: scatter_add_mps_out - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method -- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor +- func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: scatter_reduce.two_out variants: function, method + +- func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: scatter_reduce.two_out + variants: method + +- func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU: scatter_reduce_two_cpu + CPU, CUDA: scatter_reduce_two - func: eq_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) structured_delegate: eq.Scalar_out device_check: NoCheck # TensorIterator variants: method @@ -6091,10 +6582,16 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: CompositeExplicitAutograd: bitwise_and +- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and + - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function structured_delegate: bitwise_and.Tensor_out @@ -6139,10 +6636,16 @@ - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function +- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or + - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function structured_delegate: bitwise_or.Tensor_out @@ -6187,10 +6690,16 @@ - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function +- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor + - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function structured_delegate: bitwise_xor.Tensor_out @@ -6234,16 +6743,18 @@ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: __ilshift__ + autogen: __lshift__.Scalar_out - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: __ilshift__ + autogen: __lshift__.Tensor_out - func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: bitwise_left_shift.Tensor_out @@ -6262,29 +6773,29 @@ - func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: bitwise_left_shift + CompositeExplicitAutograd: bitwise_left_shift - func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: bitwise_left_shift_ + CompositeExplicitAutograd: bitwise_left_shift_ - func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_left_shift_out + CompositeExplicitAutograd: bitwise_left_shift_out - func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_left_shift + CompositeExplicitAutograd: bitwise_left_shift - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: @@ -6299,16 +6810,18 @@ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: __irshift__ + autogen: __rshift__.Scalar_out - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: __irshift__ + autogen: __rshift__.Tensor_out - func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: bitwise_right_shift.Tensor_out @@ -6327,29 +6840,29 @@ - func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: bitwise_right_shift + CompositeExplicitAutograd: bitwise_right_shift - func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: bitwise_right_shift_ + CompositeExplicitAutograd: bitwise_right_shift_ - func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_right_shift_out + CompositeExplicitAutograd: bitwise_right_shift_out - func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_right_shift + CompositeExplicitAutograd: bitwise_right_shift - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) structured_delegate: tril.out variants: method @@ -6374,78 +6887,93 @@ - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) variants: method dispatch: CPU, CUDA: addbmm_ + MPS: addbmm_mps_ - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: addbmm_out + MPS: addbmm_out_mps - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: method, function dispatch: CPU, CUDA: addbmm + MPS: addbmm_mps - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: random_ Meta: random_meta_ + MPS: random_mps_ + autogen: random.from_functional, random.from_out - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: random_ Meta: random_meta_ + MPS: random_mps_ + autogen: random.to_functional, random.to_out - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: random_ Meta: random_meta_ + autogen: random.functional, random.out - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: uniform_ + MPS: uniform_mps_ Meta: uniform_meta_ + autogen: uniform.functional, uniform.out - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: cauchy_ + autogen: cauchy.functional, cauchy.out - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: log_normal_ + autogen: log_normal.functional, log_normal.out - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: exponential_ + autogen: exponential.functional, exponential.out - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: geometric_ # wrappers for TH functions + autogen: geometric.functional, geometric.out - func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: diag_cpu_out CUDA: diag_cuda_out + MPS: diag_mps_out - func: diag(Tensor self, int diagonal=0) -> Tensor variants: method, function dispatch: CompositeExplicitAutograd: diag @@ -6463,20 +6991,22 @@ - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: triu_cpu CUDA: triu_cuda + MPS: triu_mps_out - func: triu(Tensor self, int diagonal=0) -> Tensor structured_delegate: triu.out variants: method, function - func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: tril_cpu CUDA: tril_cuda + MPS: tril_mps_out - func: tril(Tensor self, int diagonal=0) -> Tensor structured_delegate: tril.out variants: method, function @@ -6505,10 +7035,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ne_Scalar_out + MPS: ne_scalar_out_mps QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor structured_delegate: ne.Scalar_out device_check: NoCheck # TensorIterator @@ -6520,10 +7051,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ne_Tensor_out + MPS: ne_tensor_out_mps QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor structured_delegate: ne.Tensor_out device_check: NoCheck # TensorIterator @@ -6566,10 +7098,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: eq_Scalar_out + MPS: eq_scalar_out_mps QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor structured_delegate: eq.Scalar_out device_check: NoCheck # TensorIterator @@ -6581,10 +7114,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: eq_Tensor_out + MPS: eq_tensor_out_mps QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor structured_delegate: eq.Tensor_out device_check: NoCheck # TensorIterator @@ -6596,10 +7130,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ge_Scalar_out + MPS: ge_scalar_out_mps QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor structured_delegate: ge.Scalar_out device_check: NoCheck # TensorIterator @@ -6611,10 +7146,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ge_Tensor_out + MPS: ge_tensor_out_mps QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor structured_delegate: ge.Tensor_out device_check: NoCheck # TensorIterator @@ -6657,10 +7193,11 @@ structured: True 
structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: le_Scalar_out + MPS: le_scalar_out_mps QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor structured_delegate: le.Scalar_out device_check: NoCheck # TensorIterator @@ -6672,10 +7209,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: le_Tensor_out + MPS: le_tensor_out_mps QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor structured_delegate: le.Tensor_out device_check: NoCheck # TensorIterator @@ -6718,10 +7256,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: gt_Scalar_out + MPS: gt_scalar_out_mps QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor structured_delegate: gt.Scalar_out device_check: NoCheck # TensorIterator @@ -6733,10 +7272,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: gt_Tensor_out + MPS: gt_tensor_out_mps QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor structured_delegate: gt.Tensor_out device_check: NoCheck # TensorIterator @@ -6779,10 +7319,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: lt_Scalar_out + MPS: lt_scalar_out_mps QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor structured_delegate: lt.Scalar_out device_check: NoCheck # TensorIterator @@ -6794,10 +7335,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: lt_Tensor_out + MPS: lt_tensor_out_mps QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor structured_delegate: lt.Tensor_out device_check: NoCheck # TensorIterator @@ -6852,19 +7394,22 @@ - func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, QuantizedCPU: index_select_out_cpu_ CUDA, QuantizedCUDA: index_select_out_cuda + MPS: index_select_out_mps - func: index_select(Tensor self, int dim, Tensor index) -> Tensor variants: method, function dispatch: CPU: index_select_cpu_ QuantizedCPU: index_select_quantized_cpu_ - CUDA, QuantizedCUDA: index_select_cuda - SparseCPU: index_select_sparse - SparseCUDA: index_select_sparse + CUDA: index_select_cuda + QuantizedCUDA: index_select_quantized_cuda + SparseCPU: index_select_sparse_cpu + SparseCUDA: index_select_sparse_cuda + MPS: index_select_mps - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor variants: method, function @@ -6909,10 +7454,11 @@ - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) 
structured: True dispatch: CPU, CUDA: gather_out + MPS: gather_out_mps - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function structured_delegate: gather.out @@ -6932,10 +7478,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcmul_out + MPS: addcmul_out_mps - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor structured_delegate: addcmul.out device_check: NoCheck # TensorIterator variants: method, function @@ -6949,10 +7496,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcdiv_out + MPS: addcdiv_out_mps - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor structured_delegate: addcdiv.out device_check: NoCheck # TensorIterator variants: method, function @@ -6996,14 +7544,17 @@ dispatch: CPU, CUDA: linalg_solve_triangular_out - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor python_module: linalg - variants: method, function + variants: function dispatch: CPU, CUDA: linalg_solve_triangular +- func: linalg_vander(Tensor x, *, int? N=None) -> Tensor + python_module: linalg + - func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) dispatch: CompositeExplicitAutograd: symeig_out - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) @@ -7077,25 +7628,10 @@ variants: function dispatch: CPU: _cholesky_solve_helper_cpu CUDA: _cholesky_solve_helper_cuda -- func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - variants: function, method - dispatch: - CompositeExplicitAutograd: solve - -- func: solve.solution(Tensor self, Tensor A, *, Tensor(a!) solution, Tensor(b!) lu) -> (Tensor(a!) solution, Tensor(b!) LU) - dispatch: - CompositeExplicitAutograd: solve_out - -- func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: _solve_helper_cpu - CUDA: _solve_helper_cuda - - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor variants: method, function dispatch: CPU, CUDA: cholesky_inverse @@ -7142,17 +7678,18 @@ - func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor variants: method, function dispatch: CPU, CUDA: lu_solve +# lu_unpack - func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) + structured_delegate: lu_unpack.out variants: function - dispatch: - CPU, CUDA: lu_unpack - func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) variants: function + structured: True dispatch: CPU, CUDA: lu_unpack_out # TODO: remove dispatch section when porting TH CUDA to ATen - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) 
@@ -7272,10 +7809,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sign_out + MPS: sign_out_mps SparseCPU, SparseCUDA: sign_sparse_out SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out - func: signbit(Tensor self) -> Tensor variants: function, method @@ -7303,10 +7841,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan2_out + MPS: atan2_mps_out - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: atan2.out variants: method @@ -7389,10 +7928,16 @@ - func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor dispatch: CPU: histogramdd_cpu +- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CompositeExplicitAutograd: fmod_out @@ -7526,10 +8071,11 @@ - func: min(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: CPU, CUDA: min + MPS: min_mps QuantizedCPU: min_quantized_cpu - func: fmin(Tensor self, Tensor other) -> Tensor structured_delegate: fmin.out device_check: NoCheck # TensorIterator @@ -7545,10 +8091,11 @@ - func: max(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: CPU, CUDA: max + MPS: max_mps QuantizedCPU: max_quantized_cpu - func: fmax(Tensor self, Tensor other) -> Tensor structured_delegate: fmax.out device_check: NoCheck # TensorIterator @@ -7570,10 +8117,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: maximum_out + MPS: maximum_out_mps # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator @@ -7591,10 +8139,11 @@ structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: minimum_out + MPS: minimum_out_mps # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min - func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -7624,31 +8173,27 @@ - func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!) - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) device_check: NoCheck # TensorIterator dispatch: - CPU: sort_out_cpu - CUDA: sort_out_cuda + CompositeExplicitAutograd: sort_out - func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + structured: True dispatch: - CPU: sort_out_cpu_stable - CUDA: sort_out_stable_cuda + CPU, CUDA: sort_stable_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU: sort_cpu - CUDA: sort_cuda - QuantizedCPU: sort_quantized_cpu + CompositeExplicitAutograd: sort - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + structured_delegate: sort.values_stable variants: method, function dispatch: - CPU: sort_cpu_stable - CUDA: sort_stable_cuda QuantizedCPU: sort_quantized_cpu_stable - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) - func: sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -7674,10 +8219,11 @@ - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) structured: True dispatch: CPU: topk_out_cpu CUDA: topk_out_cuda + MPS: topk_out_mps - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function structured_delegate: topk.values dispatch: @@ -7691,10 +8237,11 @@ - func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck structured: True dispatch: CPU, CUDA: all_all_out + MPS: all_all_out_mps - func: any(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: any.all_out variants: method, function @@ -7704,10 +8251,11 @@ - func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck structured: True dispatch: CPU, CUDA: any_all_out + MPS: any_all_out_mps - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True dispatch: @@ -7726,11 +8274,11 @@ - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: unfold + CPU, CUDA, Meta: unfold QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor variants: function dispatch: @@ -7747,10 +8295,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: pow_Tensor_Tensor_out + MPS: pow_tensor_tensor_out_mps - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: pow.Tensor_Tensor_out variants: method, function @@ -7770,10 +8319,11 @@ structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: pow_Tensor_Scalar_out SparseCPU, SparseCUDA: pow_out_sparse_scalar + MPS: pow_tensor_scalar_out_mps - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: pow.Tensor_Scalar_out variants: function, method @@ -7813,72 +8363,84 @@ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: normal_ + MPS: normal_mps_ Meta: normal_meta_ SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ + autogen: normal.functional, normal.out - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + MPS: normal_mps_out + Meta: normal_out_meta - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + #MPS: normal_mps + Meta: normal_meta - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + Meta: normal_meta + #MPS: normal_mps - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + Meta: normal_meta + #MPS: normal_mps - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: normal.float_float_out(float mean, float std, int[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) - func: alias(Tensor(a) self) -> Tensor(a) variants: method, function dispatch: CompositeExplicitAutograd: alias -- func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - dispatch: - CPU: _index_copy_impl_ - CUDA: _index_copy_impl_ - - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ + autogen: _amp_foreach_non_finite_check_and_unscale.functional, _amp_foreach_non_finite_check_and_unscale.out - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) variants: function dispatch: CUDA: _amp_update_scale_cuda_ + autogen: _amp_update_scale.functional, _amp_update_scale.out -- func: _cat(Tensor[] tensors, int dim=0) -> Tensor - dispatch: - CPU: _cat_cpu - CUDA: cat_cuda - QuantizedCPU: cat_quantized_cpu +#- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + #dispatch: + #CPU: _cat_cpu + #CUDA: cat_cuda + #MPS: cat_mps + #QuantizedCPU: cat_quantized_cpu -- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: _cat_out_cpu - CUDA: cat_out_cuda - QuantizedCPU: cat_out_quantized_cpu +#- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) 
+ #dispatch: + #CPU: _cat_out_cpu + #CUDA: cat_out_cuda + #QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7889,10 +8451,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ + autogen: _foreach_add.Scalar_functional, _foreach_add.Scalar_out - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7903,10 +8466,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + autogen: _foreach_sub.Scalar_functional, _foreach_sub.Scalar_out - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7917,10 +8481,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + autogen: _foreach_mul.Scalar_functional, _foreach_mul.Scalar_out - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7931,10 +8496,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ + autogen: _foreach_div.Scalar_functional, _foreach_div.Scalar_out - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7945,10 +8511,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ + autogen: _foreach_add.List_functional, _foreach_add.List_out - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7959,10 +8526,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ + autogen: _foreach_sub.List_functional, _foreach_sub.List_out - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7973,10 +8541,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: 
foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ + autogen: _foreach_mul.List_functional, _foreach_mul.List_out - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -7987,10 +8556,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ + autogen: _foreach_div.List_functional, _foreach_div.List_out - func: _foreach_add.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8001,10 +8571,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + autogen: _foreach_add.ScalarList_functional, _foreach_add.ScalarList_out - func: _foreach_sub.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8015,10 +8586,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + autogen: _foreach_sub.ScalarList_functional, _foreach_sub.ScalarList_out - func: _foreach_div.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8029,10 +8601,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + autogen: _foreach_div.ScalarList_functional, _foreach_div.ScalarList_out - func: _foreach_mul.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8043,10 +8616,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + autogen: _foreach_mul.ScalarList_functional, _foreach_mul.ScalarList_out - func: _foreach_exp(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8057,17 +8631,19 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ + autogen: _foreach_zero.functional, _foreach_zero.out - func: _foreach_exp_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ + autogen: _foreach_exp.functional, _foreach_exp.out - func: 
_foreach_sqrt(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8078,10 +8654,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ + autogen: _foreach_sqrt.functional, _foreach_sqrt.out - func: _foreach_abs(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8092,10 +8669,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ + autogen: _foreach_abs.functional, _foreach_abs.out - func: _foreach_acos(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8106,10 +8684,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ + autogen: _foreach_acos.functional, _foreach_acos.out - func: _foreach_asin(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8120,10 +8699,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ + autogen: _foreach_asin.functional, _foreach_asin.out - func: _foreach_atan(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8134,10 +8714,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ + autogen: _foreach_atan.functional, _foreach_atan.out - func: _foreach_ceil(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8148,10 +8729,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ + autogen: _foreach_ceil.functional, _foreach_ceil.out - func: _foreach_cos(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8162,10 +8744,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ + autogen: _foreach_cos.functional, _foreach_cos.out - func: _foreach_cosh(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8176,10 +8759,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ + autogen: 
_foreach_cosh.functional, _foreach_cosh.out - func: _foreach_erf(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8190,10 +8774,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ + autogen: _foreach_erf.functional, _foreach_erf.out - func: _foreach_erfc(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8204,10 +8789,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ + autogen: _foreach_erfc.functional, _foreach_erfc.out - func: _foreach_expm1(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8218,10 +8804,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ + autogen: _foreach_expm1.functional, _foreach_expm1.out - func: _foreach_floor(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8232,10 +8819,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ + autogen: _foreach_floor.functional, _foreach_floor.out - func: _foreach_log(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8246,10 +8834,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ + autogen: _foreach_log.functional, _foreach_log.out - func: _foreach_log10(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8260,10 +8849,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ + autogen: _foreach_log10.functional, _foreach_log10.out - func: _foreach_log1p(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8274,10 +8864,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ + autogen: _foreach_log1p.functional, _foreach_log1p.out - func: _foreach_log2(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8288,10 +8879,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: 
CPU: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ + autogen: _foreach_log2.functional, _foreach_log2.out - func: _foreach_neg(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8302,10 +8894,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ + autogen: _foreach_neg.functional, _foreach_neg.out - func: _foreach_tan(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8316,10 +8909,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ + autogen: _foreach_tan.functional, _foreach_tan.out - func: _foreach_tanh(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8330,10 +8924,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ + autogen: _foreach_tanh.functional, _foreach_tanh.out - func: _foreach_sin(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8344,10 +8939,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ + autogen: _foreach_sin.functional, _foreach_sin.out - func: _foreach_sinh(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8358,10 +8954,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ + autogen: _foreach_sinh.functional, _foreach_sinh.out - func: _foreach_round(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8372,10 +8969,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ + autogen: _foreach_round.functional, _foreach_round.out - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8386,10 +8984,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ + autogen: _foreach_lgamma.functional, _foreach_lgamma.out - func: _foreach_frac(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8400,10 +8999,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when 
tensor are on different devices variants: function dispatch: CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ + autogen: _foreach_frac.functional, _foreach_frac.out - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8414,10 +9014,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ + autogen: _foreach_reciprocal.functional, _foreach_reciprocal.out - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8428,10 +9029,11 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ + autogen: _foreach_sigmoid.functional, _foreach_sigmoid.out - func: _foreach_trunc(Tensor[] tensors) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8442,38 +9044,43 @@ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ + autogen: _foreach_trunc.functional, _foreach_trunc.out - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ + autogen: _foreach_addcdiv.Scalar_functional, _foreach_addcdiv.Scalar_out - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ + autogen: _foreach_addcmul.Scalar_functional, _foreach_addcmul.Scalar_out - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ + autogen: _foreach_addcdiv.ScalarList_functional, _foreach_addcdiv.ScalarList_out - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ + autogen: _foreach_addcmul.ScalarList_functional, _foreach_addcmul.ScalarList_out - func: _foreach_addcdiv.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: @@ -8582,29 +9189,33 @@ ## NN wrappers - func: mse_loss.out(Tensor self, Tensor target, int 
reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: mse_loss_out + MPS: mse_loss_out_mps - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: mse_loss.out python_module: nn - dispatch: - CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU, CUDA: mse_loss_backward_out + MPS: mse_loss_backward_out_mps - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor python_module: nn dispatch: CPU, CUDA: mse_loss_backward + MPS: mse_loss_backward_mps - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CompositeExplicitAutograd: l1_loss_out @@ -8691,10 +9302,11 @@ python_module: nn structured: True dispatch: CPU: nll_loss_forward_out_cpu CUDA: nll_loss_forward_out_cuda + MPS: nll_loss_forward_out_mps - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn structured_delegate: nll_loss_forward.output @@ -8702,10 +9314,11 @@ python_module: nn structured: True dispatch: CPU: nll_loss_backward_out_cpu CUDA: nll_loss_backward_out_cuda + MPS: nll_loss_backward_out_mps - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn structured_delegate: nll_loss_backward.grad_input @@ -8718,36 +9331,41 @@ - func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) python_module: nn dispatch: CPU: nll_loss2d_forward_out_cpu CUDA: nll_loss2d_forward_out_cuda + MPS: nll_loss2d_forward_out_mps - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn dispatch: CPU: nll_loss2d_forward_cpu CUDA: nll_loss2d_forward_cuda + MPS: nll_loss2d_forward_mps - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: nll_loss2d_backward_out_cpu CUDA: nll_loss2d_backward_out_cuda + MPS: nll_loss2d_backward_out_mps - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn dispatch: CPU: nll_loss2d_backward_cpu CUDA: nll_loss2d_backward_cuda + MPS: nll_loss2d_backward_mps - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: smooth_l1_loss_out + MPS: smooth_l1_loss_out_mps - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: smooth_l1_loss.out python_module: nn @@ -8755,10 +9373,11 @@ - func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out + MPS: smooth_l1_loss_backward_out_mps - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor python_module: nn dispatch: CompositeExplicitAutograd: smooth_l1_loss_backward @@ -8808,10 +9427,11 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU, CUDA: elu_out + MPS: elu_out_mps - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor structured_delegate: elu.out device_check: NoCheck # TensorIterator python_module: nn @@ -8820,10 +9440,11 @@ structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: elu_backward_out + MPS: elu_backward_out_mps - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor structured_delegate: elu_backward.grad_input python_module: nn @@ -8856,10 +9477,20 @@ python_module: nn dispatch: CPU: glu_backward_cpu CUDA: glu_backward_cuda +- func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_jvp + +- func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_backward_jvp + - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator python_module: nn @@ -8892,35 +9523,37 @@ - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh_out + CPU, CUDA, MPS: hardtanh_out QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh + CPU, CUDA, MPS: hardtanh QuantizedCPU: hardtanh_quantized_cpu - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU, CUDA: hardtanh_backward_out + MPS: hardtanh_backward_out_mps - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor python_module: nn dispatch: CPU, CUDA: hardtanh_backward + MPS: hardtanh_backward_mps - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh_ + CPU, CUDA, MPS: hardtanh_ QuantizedCPU: hardtanh_quantized_cpu_ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator python_module: nn @@ -8949,10 +9582,11 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU, CUDA: leaky_relu_out + MPS: leaky_relu_out_mps QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor structured_delegate: leaky_relu.out device_check: NoCheck # TensorIterator @@ -8964,10 +9598,11 @@ structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: leaky_relu_backward_out + MPS: leaky_relu_backward_out_mps - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor structured_delegate: leaky_relu_backward.grad_input python_module: nn @@ -9086,10 +9721,11 @@ - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda + MPS: adaptive_avg_pool2d_out_mps MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor python_module: nn @@ -9103,17 +9739,20 @@ - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda + MPS: adaptive_avg_pool2d_mps QuantizedCPU: adaptive_avg_pool2d_quantized_cpu + QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda + MPS: adaptive_avg_pool2d_backward_mps - func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: adaptive_avg_pool3d_out_cpu @@ -9146,10 +9785,11 @@ python_module: nn structured: True dispatch: CPU: adaptive_max_pool2d_out_cpu CUDA: adaptive_max_pool2d_out_cuda + MPS: adaptive_max_pool2d_out_mps # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) python_module: nn structured_delegate: adaptive_max_pool2d.out @@ -9158,10 +9798,11 @@ python_module: nn structured: True dispatch: CPU: adaptive_max_pool2d_backward_out_cpu CUDA: adaptive_max_pool2d_backward_out_cuda + MPS: adaptive_max_pool2d_backward_out_mps - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor python_module: nn structured_delegate: adaptive_max_pool2d_backward.grad_input @@ -9197,10 +9838,11 @@ - stride -> int dH, int dW - padding -> int padH, int padW dispatch: CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda + MPS: avg_pool2d_out_mps MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn structured_delegate: avg_pool2d.out @@ -9212,10 +9854,11 @@ python_module: nn structured: True dispatch: CPU: avg_pool2d_backward_out_cpu CUDA: avg_pool2d_backward_out_cuda + MPS: avg_pool2d_backward_out_mps MkldnnCPU: mkldnn_avg_pool2d_backward_out - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? 
divisor_override) -> Tensor python_module: nn structured_delegate: avg_pool2d_backward.grad_input @@ -9280,10 +9923,11 @@ python_module: nn structured: True precomputed: - kernel_size -> int poolSizeT, int poolSizeH, int poolSizeW - output_size -> int outputT, int outputH, int outputW + - int numBatch, int numPlanes, int inputT, int inputH, int inputW dispatch: CPU: fractional_max_pool3d_out_cpu CUDA: fractional_max_pool3d_out_cuda # Return: (Tensor output, Tensor indices) @@ -9308,10 +9952,11 @@ python_module: nn structured: True dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda + MPS: max_pool2d_with_indices_out_mps # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) python_module: nn structured_delegate: max_pool2d_with_indices.out @@ -9320,10 +9965,11 @@ python_module: nn structured: True dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda + MPS: max_pool2d_with_indices_backward_out_mps - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor python_module: nn structured_delegate: max_pool2d_with_indices_backward.grad_input @@ -9363,22 +10009,10 @@ python_module: nn dispatch: CPU: max_unpooling2d_forward_cpu CUDA: max_unpooling2d_forward_cuda -- func: max_unpool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: max_unpooling2d_backward_out_cpu - CUDA: max_unpooling2d_backward_out_cuda - -- func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - python_module: nn - dispatch: - CPU: max_unpooling2d_backward_cpu - CUDA: max_unpooling2d_backward_cuda - - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: max_unpooling3d_forward_out_cpu CUDA: max_unpooling3d_forward_out_cuda @@ -9387,76 +10021,71 @@ python_module: nn dispatch: CPU: max_unpooling3d_forward_cpu CUDA: max_unpooling3d_forward_cuda -- func: max_unpool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: max_unpooling3d_backward_out_cpu - CUDA: max_unpooling3d_backward_out_cuda - -- func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - python_module: nn - dispatch: - CPU: max_unpooling3d_backward_cpu - CUDA: max_unpooling3d_backward_cuda - - func: reflection_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: - CPU, QuantizedCPU: reflection_pad1d_out_cpu + CPU: reflection_pad1d_out_cpu + QuantizedCPU: reflection_pad1d_out_quantized_cpu CUDA: reflection_pad1d_out_cuda + MPS: reflection_pad1d_out_mps - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor python_module: nn structured_delegate: reflection_pad1d.out - dispatch: - QuantizedCPU: reflection_pad1d_cpu - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) 
grad_input) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: reflection_pad1d_backward_out_cpu CUDA: reflection_pad1d_backward_out_cuda + MPS: reflection_pad1d_backward_out_mps - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn structured_delegate: reflection_pad1d_backward.grad_input - func: reflection_pad2d.out(Tensor self, int[4] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda + MPS: reflection_pad2d_out_mps - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: - CPU, QuantizedCPU: reflection_pad2d_cpu + CPU: reflection_pad2d_cpu + QuantizedCPU: reflection_pad2d_quantized_cpu CUDA: reflection_pad2d_cuda + MPS: reflection_pad2d_mps - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: reflection_pad2d_backward_out_cpu CUDA: reflection_pad2d_backward_out_cuda + MPS: reflection_pad2d_backward_out_mps - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu CUDA: reflection_pad2d_backward_cuda + MPS: reflection_pad2d_backward_mps - func: reflection_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: reflection_pad3d_out_cpu CUDA: reflection_pad3d_out_cuda + MPS: reflection_pad3d_out_mps - func: reflection_pad3d(Tensor self, int[6] padding) -> Tensor python_module: nn structured_delegate: reflection_pad3d.out @@ -9464,10 +10093,11 @@ python_module: nn structured: True dispatch: CPU: reflection_pad3d_backward_out_cpu CUDA: reflection_pad3d_backward_out_cuda + MPS: reflection_pad3d_backward_out_mps - func: reflection_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor python_module: nn structured_delegate: reflection_pad3d_backward.grad_input @@ -9475,10 +10105,11 @@ python_module: nn structured: True dispatch: CPU: replication_pad1d_out_cpu CUDA: replication_pad1d_out_cuda + MPS: replication_pad1d_out_mps - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor python_module: nn structured_delegate: replication_pad1d.out @@ -9486,10 +10117,11 @@ python_module: nn structured: True dispatch: CPU: replication_pad1d_backward_out_cpu CUDA: replication_pad1d_backward_out_cuda + MPS: replication_pad1d_backward_out_mps - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn structured_delegate: replication_pad1d_backward.grad_input @@ -9497,50 +10129,65 @@ python_module: nn structured: True dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda + MPS: replication_pad2d_out_mps - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn structured_delegate: replication_pad2d.out - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn dispatch: CPU: replication_pad2d_backward_out_cpu CUDA: replication_pad2d_backward_out_cuda + MPS: replication_pad2d_backward_out_mps - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: replication_pad2d_backward_cpu CUDA: replication_pad2d_backward_cuda + MPS: replication_pad2d_backward_mps - func: replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: replication_pad3d_out_cpu CUDA: replication_pad3d_out_cuda + MPS: replication_pad3d_out_mps - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor python_module: nn structured_delegate: replication_pad3d.out - func: replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, int[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: replication_pad3d_backward_out_cpu CUDA: replication_pad3d_backward_out_cuda + MPS: replication_pad3d_backward_out_mps - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda + MPS: replication_pad3d_backward_mps +- func: _pad_circular(Tensor self, int[] pad) -> Tensor + python_module: nn + +- func: _pad_enum(Tensor self, int[] pad, int mode, float? value=None) -> Tensor + python_module: nn + +- func: pad(Tensor self, int[] pad, str mode="constant", float? value=None) -> Tensor + python_module: nn + - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor python_module: nn dispatch: CompositeExplicitAutograd: upsample_linear1d @@ -9692,10 +10339,11 @@ python_module: nn structured: True dispatch: CPU: upsample_bilinear2d_out_cpu CUDA: upsample_bilinear2d_out_cuda + MPS: upsample_bilinear2d_out_mps - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn structured_delegate: upsample_bilinear2d.out dispatch: @@ -9705,10 +10353,11 @@ python_module: nn structured: True dispatch: CPU: upsample_bilinear2d_backward_out_cpu CUDA: upsample_bilinear2d_backward_out_cuda + MPS: upsample_bilinear2d_backward_out_mps - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn structured_delegate: upsample_bilinear2d_backward.grad_input @@ -9848,17 +10497,19 @@ python_module: nn structured: True dispatch: CPU: upsample_nearest2d_out_cpu CUDA: upsample_nearest2d_out_cuda + MPS: upsample_nearest2d_out_mps - func: _upsample_nearest_exact2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: _upsample_nearest_exact2d_out_cpu CUDA: _upsample_nearest_exact2d_out_cuda + MPS: _upsample_nearest_exact2d_out_mps - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? 
scales_w=None) -> Tensor python_module: nn structured_delegate: upsample_nearest2d.out dispatch: @@ -9874,17 +10525,19 @@ python_module: nn structured: True dispatch: CPU: upsample_nearest2d_backward_out_cpu CUDA: upsample_nearest2d_backward_out_cuda + MPS: upsample_nearest2d_backward_out_mps - func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: _upsample_nearest_exact2d_backward_out_cpu CUDA: _upsample_nearest_exact2d_backward_out_cuda + MPS: _upsample_nearest_exact2d_backward_out_mps - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn structured_delegate: upsample_nearest2d_backward.grad_input @@ -9944,10 +10597,11 @@ python_module: nn structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_backward_out + MPS: sigmoid_backward_out_mps - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn structured_delegate: sigmoid_backward.grad_input @@ -9966,10 +10620,11 @@ python_module: nn structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_backward_out + MPS: tanh_backward_out_mps - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn structured_delegate: tanh_backward.grad_input @@ -10231,10 +10886,23 @@ python_module: special variants: function dispatch: CPU, CUDA: special_ndtri_out +- func: special_log_ndtr(Tensor self) -> Tensor + structured_delegate: special_log_ndtr.out + python_module: special + variants: function + +- func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_log_ndtr_out + - func: special_expm1(Tensor self) -> Tensor python_module: special variants: function - func: special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -10484,11 +11152,11 @@ - func: special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) python_module: special - func: special_polygamma(int n, Tensor self) -> Tensor python_module: special - variants: function, method + variants: function - func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: special - func: special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor @@ -10780,15 +11448,19 @@ variants: function - func: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor python_module: linalg variants: function + structured_delegate: linalg_cross.out dispatch: - CPU, CUDA: linalg_cross + ZeroTensor: linalg_cross_zerotensor - func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) python_module: linalg + structured: True + precomputed: + - dim -> int dim dispatch: CPU, CUDA: linalg_cross_out # linalg.lu_factor - func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots) @@ -10809,10 +11481,24 @@ variants: function structured: True dispatch: CPU, CUDA: linalg_lu_factor_ex_out +# linalg.lu +- func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + python_module: linalg + structured_delegate: linalg_lu.out + variants: function + +- func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) 
P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_out + +# linalg.det - func: linalg_det(Tensor self) -> Tensor python_module: linalg variants: function - func: linalg_det.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -10830,10 +11516,42 @@ - func: _det_lu_based_helper_backward_helper(Tensor det_grad, Tensor det, Tensor self, Tensor lu, Tensor pivs) -> Tensor variants: function dispatch: CPU, CUDA: _det_lu_based_helper_backward_helper +- func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info) + structured_delegate: linalg_ldl_factor_ex.out + python_module: linalg + variants: function + +- func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_factor_ex_out + +- func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor + structured_delegate: linalg_ldl_solve.out + python_module: linalg + variants: function + +- func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_solve_out + - func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values) python_module: linalg variants: function dispatch: CompositeExplicitAutograd: linalg_lstsq @@ -10899,11 +11617,11 @@ - func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor python_module: linalg variants: function -- func: linalg_eigvalsh.out(Tensor self, str UPLO='L', *, Tensor(a!) out) -> Tensor(a!) +- func: linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!) python_module: linalg dispatch: CPU, CUDA: linalg_eigvalsh_out - func: linalg_householder_product(Tensor input, Tensor tau) -> Tensor @@ -10920,10 +11638,11 @@ - func: _linalg_inv_out_helper_(Tensor(a!) self, Tensor(b!) infos_lu, Tensor(c!) infos_getri) -> Tensor(a!) variants: function dispatch: CPU: _linalg_inv_out_helper_cpu CUDA: _linalg_inv_out_helper_cuda + autogen: _linalg_inv_out_helper.functional, _linalg_inv_out_helper.out - func: linalg_inv_ex(Tensor self, *, bool check_errors=False) -> (Tensor inverse, Tensor info) python_module: linalg variants: function dispatch: @@ -10976,15 +11695,15 @@ variants: function - func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor python_module: linalg variants: function - dispatch: - CPU, CUDA: linalg_vector_norm + structured_delegate: linalg_vector_norm.out - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
python_module: linalg + structured: True dispatch: CPU, CUDA: linalg_vector_norm_out - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor python_module: linalg @@ -11104,17 +11823,17 @@ - func: linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!) python_module: linalg variants: function -- func: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) +- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) python_module: linalg variants: function dispatch: CompositeExplicitAutograd: linalg_qr -- func: linalg_qr.out(Tensor self, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) +- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) python_module: linalg variants: function dispatch: CompositeExplicitAutograd: linalg_qr_out @@ -11230,5 +11949,449 @@ python_module: nn - func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] variants: function python_module: nn + +- func: nested_tensor(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: function + +- func: _fw_primal_copy(Tensor self, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _fw_primal_copy + tags: view_copy + +- func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _make_dual_copy + tags: view_copy + +- func: view_as_real_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_as_real_copy + tags: view_copy + +- func: view_as_complex_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_as_complex_copy + tags: view_copy + +- func: _conj_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _conj_copy + tags: view_copy + +- func: _neg_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _neg_view_copy + tags: view_copy + +- func: as_strided_copy(Tensor self, int[] size, int[] stride, int? 
storage_offset=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: as_strided_copy + tags: view_copy + +- func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _sparse_broadcast_to_copy + tags: view_copy + +- func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: diagonal_copy + tags: view_copy + +- func: expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy + tags: view_copy + +- func: expand_copy.SymInt(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_SymInt + tags: view_copy + +- func: permute_copy(Tensor self, int[] dims) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: permute_copy + tags: view_copy + +- func: _reshape_alias_copy(Tensor self, int[] size, int[] stride) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_alias_copy + tags: view_copy + +- func: select_copy.int(Tensor self, int dim, int index) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: select_copy_int + tags: view_copy + +- func: detach_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: detach_copy + tags: view_copy + +- func: slice_copy.Tensor(Tensor self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: slice_copy_Tensor + tags: view_copy + +- func: split_copy.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor + tags: view_copy + +- func: split_with_sizes_copy(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy + tags: view_copy + +- func: squeeze_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy + tags: view_copy + +- func: squeeze_copy.dim(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_dim + tags: view_copy + +- func: t_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: t_copy + tags: view_copy + +- func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: transpose_copy_int + tags: view_copy + +- func: unsqueeze_copy(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: unsqueeze_copy + tags: view_copy + +- func: _indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _indices_copy + tags: view_copy + +- func: _values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _values_copy + tags: view_copy + +- func: indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: indices_copy + tags: view_copy + +- func: values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: values_copy + tags: view_copy + +- func: crow_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: crow_indices_copy + tags: view_copy + +- func: col_indices_copy(Tensor 
self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: col_indices_copy + tags: view_copy + +- func: ccol_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: ccol_indices_copy + tags: view_copy + +- func: row_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: row_indices_copy + tags: view_copy + +- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: unbind_copy_int + tags: view_copy + +- func: view_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_copy + tags: view_copy + +- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_dtype + tags: view_copy + +- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: unfold_copy + tags: view_copy + +- func: alias_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: alias_copy + tags: view_copy + +- func: _fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _fw_primal_copy_out + + +- func: _make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _make_dual_copy_out + + +- func: view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_as_real_copy_out + + +- func: view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_as_complex_copy_out + + +- func: _conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _conj_copy_out + + +- func: _neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _neg_view_copy_out + + +- func: as_strided_copy.out(Tensor self, int[] size, int[] stride, int? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: as_strided_copy_out + + +- func: _sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _sparse_broadcast_to_copy_out + + +- func: diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: diagonal_copy_out + + +- func: expand_copy.SymInt_out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_SymInt_out + + +- func: expand_copy.out(Tensor self, int[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_out + + +- func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: permute_copy_out + + +- func: _reshape_alias_copy.out(Tensor self, int[] size, int[] stride, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + dispatch: + CompositeExplicitAutograd: _reshape_alias_copy_out + + +- func: select_copy.int_out(Tensor self, int dim, int index, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: select_copy_int_out + + +- func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: detach_copy_out + + +- func: slice_copy.Tensor_out(Tensor self, int dim=0, int? start=None, int? end=None, int step=1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: slice_copy_Tensor_out + + +- func: split_copy.Tensor_out(Tensor self, int split_size, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor_out + + +- func: split_with_sizes_copy.out(Tensor self, int[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy_out + + +- func: squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_out + + +- func: squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_dim_out + + +- func: t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: t_copy_out + + +- func: transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: transpose_copy_int_out + + +- func: unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: unsqueeze_copy_out + + +- func: _indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _indices_copy_out + + +- func: _values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _values_copy_out + + +- func: indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: indices_copy_out + + +- func: values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: values_copy_out + + +- func: crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: crow_indices_copy_out + + +- func: col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: col_indices_copy_out + + +- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: unbind_copy_int_out + + +- func: view_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_out + + +- func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_dtype_out + + +- func: unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: unfold_copy_out + + +- func: alias_copy.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: alias_copy_out + +- func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor + variants: method + dispatch: + NestedTensorCPU: NestedTensor_to_padded_tensor_generic + NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda + +- func: _nested_tensor_layer_norm(Tensor self, Tensor? weight, Tensor? bias, float eps) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_layer_norm + +# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is. +- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None) -> Tensor + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward + +- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention
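
The long run of *_copy declarations above introduces non-aliasing, purely functional counterparts of the existing view operations (transpose, diagonal, slice, view, and so on), each tagged view_copy, dispatched through CompositeExplicitAutograd, and paired with an .out overload. A minimal Python sketch of the intended contract follows; it is illustrative only and assumes the ops surface in Python under the same names (e.g. torch.transpose_copy), falling back to the behaviourally equivalent view-plus-clone form where they do not.

    import torch

    x = torch.arange(6.0).reshape(2, 3)

    # transpose returns a view: same storage as x, no copy
    view = x.transpose(0, 1)
    assert view.data_ptr() == x.data_ptr()

    # transpose_copy (declared above) is specified to return a fresh tensor with
    # the same values; exposure as torch.transpose_copy is an assumption here,
    # so fall back to the clone() form, which has the same observable behaviour
    copy_fn = getattr(torch, "transpose_copy", None)
    copied = copy_fn(x, 0, 1) if copy_fn is not None else x.transpose(0, 1).clone()

    assert torch.equal(copied, view)           # identical values
    assert copied.data_ptr() != x.data_ptr()   # but no aliasing of x's storage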
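
The final entries add the nested-tensor surface (the nested_tensor constructor, the to_padded_tensor method, NestedTensor dispatch keys) together with the fused transformer encoder and multi-head attention kernels that consume it. The sketch below exercises only the two schemas nested_tensor and to_padded_tensor as declared above; the Python spellings torch.nested_tensor and Tensor.to_padded_tensor are assumed from the matching PyTorch release and are not themselves part of this diff.

    import torch

    a = torch.randn(2, 5)
    b = torch.randn(3, 5)

    # ragged "batch" of two sequences with different lengths
    # (torch.nested_tensor is the assumed Python name for the schema above)
    nt = torch.nested_tensor([a, b])

    # densify: the shorter sequence is padded with 0.0, giving a (2, 3, 5) tensor
    padded = nt.to_padded_tensor(0.0)
    print(padded.shape)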