codegen/native_functions.yaml in torch-rb-0.8.3 vs codegen/native_functions.yaml in torch-rb-0.9.0

- removed (present in torch-rb 0.8.3 only)
+ added (new in torch-rb 0.9.0)
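Several of the `+ func:` entries below (for example `aminmax`, `concat`, and `isin`) are operator schemas newly pulled in between these two releases. As a rough orientation, the sketch below exercises a few of them through the upstream PyTorch Python API; it assumes PyTorch 1.10-era semantics for these ops and is not the torch-rb Ruby surface, which wraps the same schemas under its own method names.

```python
import torch

x = torch.tensor([[1.0, 5.0], [3.0, 2.0]])

# aminmax: one pass over the data returning both the min and the max
# (defined below as a structured op with an out-variant).
mn, mx = torch.aminmax(x, dim=1)

# concat: a plain alias for torch.cat, matching the "alias for torch.cat"
# comment in the diff.
y = torch.concat([x, x], dim=0)

# isin: elementwise membership test against a set of test elements.
mask = torch.isin(x, torch.tensor([2.0, 5.0]))

print(mn, mx, y.shape, mask)
```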

@@ -87,10 +87,14 @@
 # Enables .grad attribute for non-leaf Tensors.
 - func: retain_grad(Tensor(a!) self) -> ()
   manual_cpp_binding: True
   variants: method

+- func: retains_grad(Tensor self) -> bool
+  manual_cpp_binding: True
+  variants: method
+
 - func: _fw_primal(Tensor(a) self, int level) -> Tensor(a)
   variants: method
   dispatch:
     CompositeExplicitAutograd: _fw_primal
@@ -276,19 +280,19 @@
   dispatch:
     CPU, CUDA: view_as_complex

 - func: sgn(Tensor self) -> Tensor
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: sgn
+  structured_delegate: sgn.out

 - func: sgn_(Tensor(a!) self) -> Tensor(a!)
   variants: method
-  dispatch:
-    CompositeExplicitAutograd: sgn_
+  structured_delegate: sgn.out

 - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: sgn_out

 - func: real(Tensor(a) self) -> Tensor(a)
   device_check: NoCheck # TensorIterator
@@ -296,25 +300,48 @@
 - func: imag(Tensor(a) self) -> Tensor(a)
   device_check: NoCheck # TensorIterator
   variants: function

+- func: _conj(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _conj
+
 - func: conj(Tensor(a) self) -> Tensor(a)
-  device_check: NoCheck # TensorIterator
   variants: function, method
+  manual_cpp_binding: True

-- func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  device_check: NoCheck # TensorIterator
+- func: _conj_physical(Tensor self) -> Tensor
+  variants: function, method
   dispatch:
-    CPU, CUDA: conj_out
-    SparseCPU, SparseCUDA: conj_out_sparse
+    CompositeExplicitAutograd: _conj_physical

-- func: _conj(Tensor self) -> Tensor
-  variants: function
+- func: conj_physical(Tensor self) -> Tensor
+  variants: function, method
+
+- func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
-    CompositeExplicitAutograd: _conj
+    CPU, CUDA: conj_physical_out
+    SparseCPU, SparseCUDA: conj_physical_out_sparse
+- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: conj_physical_
+
+- func: resolve_conj(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+
+- func: resolve_neg(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+
+- func: _neg_view(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _neg_view
+
 - func: acos(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
   structured_delegate: acos.out
@@ -350,20 +377,20 @@
   device_check: NoCheck # TensorIterator
   structured_delegate: add.out
   variants: function, method
   dispatch:
     SparseCPU, SparseCUDA: add_sparse
-    SparseCsrCPU: add_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
     MkldnnCPU: mkldnn_add

 - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: method
   structured_delegate: add.out
   dispatch:
     SparseCPU, SparseCUDA: add_sparse_
-    SparseCsrCPU: add_sparse_csr_
+    SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
     MkldnnCPU: mkldnn_add_

 - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   structured: True
@@ -371,10 +398,11 @@
   dispatch:
     CPU, CUDA: add_out
     SparseCPU: add_out_sparse_cpu
     SparseCUDA: add_out_sparse_cuda
     SparseCsrCPU: add_out_sparse_csr_cpu
+    SparseCsrCUDA: add_out_sparse_csr_cuda
     MkldnnCPU: mkldnn_add_out

 - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
   variants: function
   dispatch:
@@ -388,10 +416,20 @@
 - func: _add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
     CPU: add_relu_out

+- func: _add_relu.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+  variants: function
+  dispatch:
+    CPU: add_relu
+
+- func: _add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: add_relu_
+
 # For C++ only, until we have conversion from C++ numbers to Tensor
 - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
@@ -441,16 +479,18 @@
 - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor
   variants: function

 - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
   device_check: NoCheck # TensorIterator
+  structured_delegate: all.out
   variants: function, method
-  dispatch:
-    CPU, CUDA: all

 - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
+  structured: True
+  precomputed:
+  - dim -> int dim
   dispatch:
     CPU, CUDA: all_out

 - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -462,16 +502,18 @@
 - func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
   variants: function, method

 - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
   device_check: NoCheck # TensorIterator
+  structured_delegate: any.out
   variants: function, method
-  dispatch:
-    CPU, CUDA: any

 - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
+  structured: True
+  precomputed:
+  - dim -> int dim
   dispatch:
     CPU, CUDA: any_out

 - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -499,26 +541,26 @@
 # preserve tracing. Get rid of this when arange can directly take tensors for bounds
 # (so that it can be traced directly).
 - func: _dim_arange(Tensor like, int dim) -> Tensor

 - func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+  structured_delegate: argmax.out
   device_check: NoCheck # TensorIterator
   variants: function, method
-  dispatch:
-    CPU, CUDA: argmax

 - func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
   dispatch:
     CPU, CUDA: argmax_out

 - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+  structured_delegate: argmin.out
   device_check: NoCheck # TensorIterator
   variants: function, method
-  dispatch:
-    CPU, CUDA: argmin

 - func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
   dispatch:
     CPU, CUDA: argmin_out

 - func: acosh(Tensor self) -> Tensor
   variants: function, method
@@ -903,28 +945,18 @@
     CPU: bmm_cpu
     CUDA: bmm_cuda
     SparseCPU: bmm_sparse_cpu
     SparseCUDA: bmm_sparse_cuda

-- func: _bmm(Tensor self, Tensor mat2, *, bool deterministic=False) -> Tensor
-  variants: function
-  dispatch:
-    SparseCUDA: _bmm_sparse_cuda
-
 - func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
     CPU: bmm_out_cpu
     CUDA: bmm_out_cuda
     SparseCPU: bmm_out_sparse_cpu
     SparseCUDA: bmm_out_sparse_cuda

-- func: _bmm.out(Tensor self, Tensor mat2, *, bool deterministic=False, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    SparseCUDA: _bmm_out_sparse_cuda
-
 - func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
   device_check: NoCheck
   device_guard: False

 - func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
@@ -940,10 +972,19 @@
 - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor

 - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)

+# alias for torch.cat
+- func: concat(Tensor[] tensors, int dim=0) -> Tensor
+
+- func: concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: concat.names(Tensor[] tensors, Dimname dim) -> Tensor
+
+- func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
 - func: block_diag(Tensor[] tensors) -> Tensor
   variants: function

 - func: ceil(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -994,12 +1035,12 @@
 - func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
   cpp_no_default_args: ['min']
+  structured_delegate: clamp.out
   dispatch:
-    CPU, CUDA: clamp
     QuantizedCPU: clamp_quantized_cpu

 - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
   variants: function, method
   dispatch:
@@ -1007,10 +1048,11 @@
 - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: function, method
   cpp_no_default_args: ['min']
+  structured_delegate: clamp.out
   dispatch:
     CompositeExplicitAutograd: clamp_

 - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
   variants: method
@@ -1018,10 +1060,12 @@
     CompositeExplicitAutograd: clamp_

 - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   cpp_no_default_args: ['min']
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: clamp_out

 - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -1198,10 +1242,15 @@
     CompositeExplicitAutograd: copy_

 - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
   dispatch: {}

+# We need this to be able to properly copy from a CPU to an XLA tensor with different sizes.
+# See https://github.com/pytorch/xla/issues/2881 +- func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor + dispatch: {} + - func: cos(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: cos.out @@ -1237,17 +1286,24 @@ - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor variants: function, method dispatch: - CPU, CUDA: count_nonzero + CPU: count_nonzero_cpu + CUDA: count_nonzero_cuda - func: count_nonzero(Tensor self, int? dim=None) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: count_nonzero +- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor + variants: function, method + +- func: corrcoef(Tensor self) -> Tensor + variants: function, method + - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid dispatch: CUDA: cudnn_affine_grid_generator_forward # TODO: Why do I have to call this grad?! @@ -1383,24 +1439,23 @@ variants: function device_check: NoCheck device_guard: False - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + structured_delegate: cumprod.out device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: cumprod - func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) + structured_delegate: cumprod.out variants: method - dispatch: - CompositeExplicitAutograd: cumprod_ - func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True device_check: NoCheck # TensorIterator dispatch: - CompositeExplicitAutograd: cumprod_out + CPU, CUDA: cumprod_out - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -1414,24 +1469,23 @@ variants: function device_check: NoCheck device_guard: False - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + structured_delegate: cumsum.out device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: cumsum - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) + structured_delegate: cumsum.out variants: method - dispatch: - CompositeExplicitAutograd: cumsum_ - func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True device_check: NoCheck # TensorIterator dispatch: - CompositeExplicitAutograd: cumsum_out + CPU, CUDA: cumsum_out - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -1439,10 +1493,14 @@ variants: method - func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator +- func: cumulative_trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: cumulative_trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor + - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor # convenience function that converts to intlists for you - func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor @@ -1468,14 +1526,16 @@ CompositeExplicitAutograd: diagonal - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) variants: function, method -- func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor +- func: diagonal_backward(Tensor grad_output, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor variants: function device_check: NoCheck device_guard: False + dispatch: + CompositeExplicitAutograd: diagonal_backward - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) variants: method - func: diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor @@ -1732,10 +1792,13 @@ variants: method - func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor variants: method +- func: new_ones(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + # other overrides are to provide a more helpful error message that dtype is required - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor dispatch: CPU: empty_affine_quantized_other_backends_stub QuantizedCPU, QuantizedCUDA: empty_affine_quantized @@ -1756,11 +1819,12 @@ dispatch: CPU, Meta: resize_ CUDA: resize_cuda_ QuantizedCPU: quantized_resize_cpu_ -- func: empty_quantized(int[] size, Tensor qtensor) -> Tensor +- func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + category_override: factory variants: function dispatch: QuantizedCPU, QuantizedCUDA: empty_quantized - func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) @@ -2212,10 +2276,40 @@ CUDA: _inverse_helper_cuda - func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor variants: function, method +- func: isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) + variants: function + structured: True + dispatch: + CPU, CUDA: isin_Tensor_Tensor_out + +- func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Tensor_Tensor_out + +- func: isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + structured: True + dispatch: + CPU, CUDA: isin_Tensor_Scalar_out + +- func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Tensor_Scalar_out + +- func: isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!) + variants: function + structured: True + dispatch: + CPU, CUDA: isin_Scalar_Tensor_out + +- func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor + variants: function + structured_delegate: isin.Scalar_Tensor_out + - func: isnan(Tensor self) -> Tensor variants: function, method device_check: NoCheck device_guard: False dispatch: @@ -2237,10 +2331,20 @@ variants: function, method device_check: NoCheck device_guard: False manual_cpp_binding: True +- func: is_conj(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + +- func: is_neg(Tensor self) -> bool + variants: function, method + device_guard: False + manual_cpp_binding: True + - func: isreal(Tensor self) -> Tensor variants: function, method - func: is_nonzero(Tensor self) -> bool variants: function, method @@ -2256,10 +2360,16 @@ variants: function, method device_check: NoCheck device_guard: False manual_cpp_binding: True +- func: is_inference(Tensor self) -> bool + variants: function, method + device_check: NoCheck + device_guard: False + manual_cpp_binding: True + - func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor dispatch: CompositeExplicitAutograd: kl_div - func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor @@ -2315,10 +2425,13 @@ CPU, CUDA: nan_to_num_out - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor python_module: nn +- func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + - func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor python_module: nn dispatch: MkldnnCPU: mkldnn_linear @@ -2462,57 +2575,57 @@ dispatch: CompositeExplicitAutograd: logaddexp2 - func: xlogy.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: xlogy.OutTensor variants: function, method - dispatch: - CPU, CUDA: xlogy - func: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: xlogy + CompositeExplicitAutograd: xlogy - func: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: xlogy + CompositeExplicitAutograd: xlogy # xlogy: inplace variant - func: xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CPU, CUDA: xlogy_ + structured_delegate: xlogy.OutTensor - func: xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: xlogy_ + CompositeExplicitAutograd: xlogy_ # xlogy: out variant - func: xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: xlogy_out - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: xlogy_out + CompositeExplicitAutograd: xlogy_out - func: xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: xlogy_out + CompositeExplicitAutograd: xlogy_out - func: logdet(Tensor self) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: logdet @@ -2530,18 +2643,26 @@ - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor + structured_delegate: _log_softmax.out + +- func: _log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: - CPU: log_softmax_cpu - CUDA: log_softmax_cuda + CPU: log_softmax_cpu_out + CUDA: log_softmax_cuda_out - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + structured_delegate: _log_softmax_backward_data.out + +- func: _log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: - CPU: log_softmax_backward_cpu - CUDA: log_softmax_backward_cuda + CPU: log_softmax_backward_cpu_out + CUDA: log_softmax_backward_cuda_out - func: _logcumsumexp(Tensor self, int dim) -> Tensor dispatch: CPU: _logcumsumexp_cpu CUDA: _logcumsumexp_cuda @@ -2606,20 +2727,31 @@ dispatch: CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor +# DEPRECATED: Use torch.aminmax instead - func: _aminmax(Tensor self) -> (Tensor, Tensor) - variants: function dispatch: CPU, CUDA: _aminmax_all +# DEPRECATED: Use torch.aminmax instead - func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) - variants: function dispatch: CPU, CUDA: _aminmax +- func: aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max) + device_check: NoCheck # TensorIterator + structured_delegate: aminmax.out + variants: function, method + +- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: aminmax_out + - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor dispatch: CPU, CUDA: _compute_linear_combination - func: _compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) @@ -2695,33 +2827,40 @@ # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: mean_cpu_gpu - QuantizedCPU: mean_quantized_cpu + CompositeExplicitAutograd: mean - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: mean.out device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: mean_cpu_gpu QuantizedCPU: mean_quantized_cpu - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ structured: True device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: mean_out_cpu_gpu + CPU, CUDA: mean_out QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator +- func: nanmean(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + device_check: NoCheck # Composite + variants: function, method + +- func: nanmean.out(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # Composite + - func: median(Tensor self) -> Tensor variants: function, method dispatch: CPU: median_cpu CUDA: median_cuda @@ -2870,22 +3009,22 @@ - func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) dispatch: CUDA: miopen_rnn_backward - func: mm(Tensor self, Tensor mat2) -> Tensor + structured_delegate: mm.out variants: function, method dispatch: - CPU: mm_cpu - CUDA: mm_cuda - SparseCPU, SparseCUDA, SparseCsrCPU: _sparse_mm + SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: _sparse_mm - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: - CPU: mm_cpu_out + CPU: mm_out_cpu CUDA: mm_out_cuda SparseCPU, SparseCUDA: _sparse_mm_out - SparseCsrCPU: _sparse_csr_mm_out + SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor dispatch: @@ -2967,16 +3106,20 @@ - func: mv(Tensor self, Tensor vec) -> Tensor variants: function, method dispatch: CPU, CUDA: mv - SparseCPU, SparseCUDA, SparseCsrCPU: mv_sparse + SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: mv_sparse - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: mv_out +- func: mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: mvlgamma_out + - func: mvlgamma(Tensor self, int p) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: mvlgamma @@ -3150,16 +3293,26 @@ - func: channel_shuffle(Tensor self, int groups) -> Tensor dispatch: CPU: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu -- func: is_pinned(Tensor self) -> bool +- func: is_pinned(Tensor self, Device? device=None) -> bool variants: method + dispatch: + CUDA: is_pinned_cuda + CompositeExplicitAutograd: is_pinned_default -- func: pin_memory(Tensor(a) self) -> Tensor(a) +# TODO: add a copy kwarg that guarantees that the tensor is put into fresh +# pinned memory +- func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a) variants: method +# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor +- func: _pin_memory(Tensor self, Device? 
device=None) -> Tensor + dispatch: + CUDA: _pin_memory_cuda + - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor variants: function @@ -3324,27 +3477,38 @@ - func: repeat(Tensor self, int[] repeats) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. dispatch: CompositeExplicitAutograd: repeat -- func: repeat_interleave.Tensor(Tensor repeats) -> Tensor +- func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor variants: function dispatch: CPU: repeat_interleave_cpu CUDA: repeat_interleave_cuda -- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None) -> Tensor +- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, int? output_size=None) -> Tensor variants: function, method -- func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor +- func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None, *, int? output_size=None) -> Tensor variants: function, method - func: reshape(Tensor(a) self, int[] shape) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False +# NOTE [ _reshape_alias ] is meant to be used in the implementation of reshape. +# They are not user-facing, hence the leading underscore. Please don't use it +# anywhere else. +- func: _reshape_alias(Tensor(a) self, int[] size, int[] stride) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA: _reshape_alias + # We don't need to support mkldnn since this is handled explicitly by the reshape operator. + - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor device_check: NoCheck device_guard: False dispatch: MkldnnCPU: mkldnn_reshape @@ -3410,40 +3574,67 @@ variants: function, method dispatch: CPU: prelu_backward_cpu CUDA: prelu_backward_cuda +- func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + CPU: gelu_out_cpu + CUDA: gelu_out_cuda + - func: gelu(Tensor self) -> Tensor + structured_delegate: gelu.out device_check: NoCheck # TensorIterator python_module: nn dispatch: MkldnnCPU: mkldnn_gelu - CPU: gelu_cpu - CUDA: gelu_cuda +- func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU: gelu_backward_out_cpu + CUDA: gelu_backward_out_cuda + - func: gelu_backward(Tensor grad, Tensor self) -> Tensor + structured_delegate: gelu_backward.grad_input python_module: nn dispatch: - CPU: gelu_backward_cpu - CUDA: gelu_backward_cuda + MkldnnCPU: mkldnn_gelu_backward - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor variants: function python_module: nn device_check: NoCheck device_guard: False +- func: hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: hardshrink_out + - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor + structured_delegate: hardshrink.out device_check: NoCheck # TensorIterator variants: function, method + +- func: hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: hardshrink + CPU, CUDA: hardshrink_backward_out - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor + structured_delegate: hardshrink_backward.grad_input variants: function, method - dispatch: - CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: rsqrt.out variants: function, method @@ -3470,14 +3661,16 @@ device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: select -- func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor +- func: select_backward(Tensor grad_output, int[] input_sizes, int dim, int index) -> Tensor variants: function device_check: NoCheck device_guard: False + dispatch: + CompositeExplicitAutograd: select_backward - func: selu(Tensor self) -> Tensor device_check: NoCheck # TensorIterator - func: selu_(Tensor(a!) self) -> Tensor(a!) @@ -3510,14 +3703,21 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: silu_out +- func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: nn + dispatch: + CPU, CUDA: silu_backward_out + - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor + structured_delegate: silu_backward.grad_input python_module: nn dispatch: - CPU, CUDA: silu_backward CompositeImplicitAutograd: math_silu_backward - func: mish(Tensor self) -> Tensor structured_delegate: mish.out python_module: nn @@ -3667,14 +3867,16 @@ device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: slice -- func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor +- func: slice_backward(Tensor grad_output, int[] input_sizes, int dim, int start, int end, int step) -> Tensor variants: function device_check: NoCheck device_guard: False + dispatch: + CompositeExplicitAutograd: slice_backward - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) variants: function, method dispatch: CompositeExplicitAutograd: slogdet @@ -3688,19 +3890,28 @@ - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor + structured_delegate: _softmax.out dispatch: - CPU: softmax_cpu - CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax +- func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: softmax_cpu_out + CUDA: softmax_cuda_out + - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + structured_delegate: _softmax_backward_data.out + +- func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ structured: True dispatch: - CPU: softmax_backward_cpu - CUDA: softmax_backward_cuda + CPU: softmax_backward_cpu_out + CUDA: softmax_backward_cuda_out - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] variants: function, method device_check: NoCheck device_guard: False @@ -3847,23 +4058,23 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: sum + CompositeExplicitAutograd: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: sum.IntList_out device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -3984,16 +4195,16 @@ variants: function, method dispatch: CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + structured_delegate: prod.int_out device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -4134,12 +4345,11 @@ variants: function - func: flip(Tensor self, int[] dims) -> Tensor variants: function, method dispatch: - CPU, QuantizedCPU: flip_cpu - CUDA: flip_cuda + CPU, QuantizedCPU, CUDA, QuantizedCUDA: flip - func: fliplr(Tensor self) -> Tensor variants: function, method - func: flipud(Tensor self) -> Tensor @@ -4156,10 +4366,14 @@ - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: rot90 +- func: trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor + +- func: trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor + - func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor @@ -4474,36 +4688,40 @@ - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, SparseCPU, SparseCUDA: norm + CompositeExplicitAutograd: norm - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, SparseCPU, SparseCUDA: norm + CompositeExplicitAutograd: norm - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? 
p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + structured_delegate: norm.dtype_out device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, SparseCPU, SparseCUDA: norm + SparseCPU, SparseCUDA: sparse_dtype_norm - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor + structured_delegate: norm.out device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, SparseCPU, SparseCUDA: norm + SparseCPU, SparseCUDA: sparse_norm - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + structured: True device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: norm_out + CPU, CUDA: norm_dtype_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: norm_out # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd @@ -4571,11 +4789,11 @@ - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: function dispatch: SparseCPU, SparseCUDA: resize_as_sparse_ - SparseCsrCPU: resize_as_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_csr_ - func: zero_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method, function dispatch: @@ -4677,18 +4895,19 @@ CPU: addmm_out_cpu CUDA: addmm_out_cuda SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda SparseCsrCPU: addmm_out_sparse_csr_dense_cpu + SparseCsrCUDA: addmm_out_sparse_csr_dense_cuda - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor structured_delegate: addmm.out variants: function, method dispatch: SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda - SparseCsrCPU: addmm_sparse_csr_dense_cpu + SparseCsrCPU, SparseCsrCUDA: addmm_sparse_csr_dense - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) structured_delegate: addmm.out variants: method dispatch: @@ -4806,24 +5025,28 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. -- func: _sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor -- func: _sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () +- func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () + - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: SparseCPU, SparseCUDA: new_with_dims_sparse - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor @@ -4846,14 +5069,17 @@ variants: method dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda +- func: _to_cpu(Tensor[] tensors) -> Tensor[] + variants: function + - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA, SparseCsrCPU: sparse_to_dense + SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor - func: sparse_dim(Tensor self) -> int @@ -4888,11 +5114,11 @@ - func: _nnz(Tensor self) -> int variants: method dispatch: SparseCPU, SparseCUDA: _nnz_sparse - SparseCsrCPU: _nnz_sparse_csr + SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr device_check: NoCheck device_guard: False # NOTE: [ coalesce autograd ] # coalesce returns self directly for already coalesced sparse tensors. @@ -4947,25 +5173,25 @@ - func: values(Tensor(a) self) -> Tensor(a) variants: method dispatch: SparseCPU, SparseCUDA: values_sparse - SparseCsrCPU: values_sparse_csr + SparseCsrCPU, SparseCsrCUDA: values_sparse_csr device_check: NoCheck device_guard: False - func: crow_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU: crow_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr device_check: NoCheck device_guard: False - func: col_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU: col_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr device_check: NoCheck device_guard: False - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -5023,25 +5249,30 @@ - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor variants: function dispatch: CPU, CUDA: quantize_per_tensor +- func: quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor + variants: function + dispatch: + CPU, CUDA: quantize_per_tensor_tensor_qparams + - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor variants: function dispatch: - CPU: quantize_per_channel_cpu + CPU, CUDA: quantize_per_channel - func: dequantize.self(Tensor self) -> Tensor variants: function, method dispatch: CPU: dequantize_cpu - QuantizedCPU, QuantizedCUDA: dequantize_quantized_cpu + QuantizedCPU, QuantizedCUDA: dequantize_quantized - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] variants: function dispatch: QuantizedCPU: dequantize_tensors_quantized_cpu @@ -5084,25 +5315,35 @@ CUDA: make_per_tensor_quantized_tensor_cuda - func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor dispatch: CPU: make_per_channel_quantized_tensor_cpu + CUDA: make_per_channel_quantized_tensor_cuda - func: qscheme(Tensor self) -> QScheme variants: method dispatch: QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor device_check: NoCheck # TensorIterator variants: function +- func: fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + - func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) variants: function dispatch: CPU, CUDA: fake_quantize_per_tensor_affine_cachemask +- func: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams + - func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor variants: function - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor variants: function @@ -5130,44 +5371,64 @@ CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) variants: function +- func: fused_moving_avg_obs_fake_quant(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> Tensor + variants: function + +- func: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) 
scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) + dispatch: + CPU: fused_moving_avg_obs_fake_quant_cpu + CUDA: fused_moving_avg_obs_fake_quant_cuda + + - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) variants: function - func: _saturate_weight_to_fp16(Tensor weight) -> Tensor variants: function - func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) variants: function +- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: _to_copy + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. -- func: to.dtype_layout(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor +- func: to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) variants: method device_check: NoCheck device_guard: False -- func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor +- func: to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) variants: method device_check: NoCheck device_guard: False -- func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor +- func: to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) variants: method device_check: NoCheck device_guard: False -- func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor +- func: to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a) variants: method device_check: NoCheck device_guard: False - func: meshgrid(Tensor[] tensors) -> Tensor[] +# TODO: Two weeks after this lands, combine these two overloads, +# making "indexing" optional. These are temporarily distinct for +# forward-compatibility reasons. +- func: meshgrid.indexing(Tensor[] tensors, *, str indexing) -> Tensor[] + - func: cartesian_prod(Tensor[] tensors) -> Tensor variants: function - func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor variants: function @@ -5431,92 +5692,136 @@ - func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor device_check: NoCheck # TensorIterator variants: function, method +- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor + structured_delegate: scatter.src_out + variants: function, method + - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) 
+ structured_delegate: scatter.src_out variants: method + +- func: scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU, CUDA: scatter_ + CPU, CUDA: scatter_src_out -- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor +- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor + structured_delegate: scatter.value_out variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) + structured_delegate: scatter.value_out variants: method + +- func: scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU, CUDA: scatter_fill_ + CPU, CUDA: scatter_value_out -- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor +- func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor + structured_delegate: scatter.reduce_out variants: function, method -- func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - variants: function, method - -- func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - variants: function, method - - func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) + structured_delegate: scatter.reduce_out variants: method + +- func: scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU, CUDA: scatter_reduce_ + CPU, CUDA: scatter_reduce_out +- func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor + structured_delegate: scatter.value_reduce_out + variants: function, method + - func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) + structured_delegate: scatter.value_reduce_out variants: method - dispatch: - CPU, CUDA: scatter_scalar_reduce_ -- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) - variants: method +- func: scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU, CUDA: scatter_add_ + CPU, CUDA: scatter_value_reduce_out +- func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor + variants: function, method + +- func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor + variants: function, method + - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor + structured_delegate: scatter_add.out variants: function, method +- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + structured_delegate: scatter_add.out + variants: method + +- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function + dispatch: + CPU, CUDA: scatter_add + - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ structured_delegate: eq.Scalar_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: eq_ - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: eq.Tensor_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: eq_ - func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_and_out + CompositeExplicitAutograd: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_and - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function + structured_delegate: bitwise_and.Tensor_out - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method + structured_delegate: bitwise_and.Tensor_out - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -5532,35 +5837,39 @@ device_check: NoCheck # TensorIterator variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_or_out + CompositeExplicitAutograd: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function + structured_delegate: bitwise_or.Tensor_out - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method + structured_delegate: bitwise_or.Tensor_out - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -5576,35 +5885,39 @@ device_check: NoCheck # TensorIterator variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase variants: function dispatch: CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_xor_out + CompositeExplicitAutograd: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function + structured_delegate: bitwise_xor.Tensor_out - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method + structured_delegate: bitwise_xor.Tensor_out - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -5642,10 +5955,51 @@ device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: __ilshift__ +- func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: bitwise_left_shift.Tensor_out + +- func: bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_left_shift.Tensor_out + +- func: bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_left_shift_out + +- func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: bitwise_left_shift + +- func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: bitwise_left_shift_ + +- func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: bitwise_left_shift_out + +- func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: bitwise_left_shift + - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: CPU, CUDA: __rshift__ @@ -5666,10 +6020,51 @@ device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: __irshift__ +- func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function, method + structured_delegate: bitwise_right_shift.Tensor_out + +- func: bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: bitwise_right_shift.Tensor_out + +- func: bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: bitwise_right_shift_out + +- func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + variants: method, function + dispatch: + CPU, CUDA: bitwise_right_shift + +- func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CPU, CUDA: bitwise_right_shift_ + +- func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: bitwise_right_shift_out + +- func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CPU, CUDA: bitwise_right_shift + - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ @@ -5683,17 +6078,10 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: digamma.out variants: method -- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) - device_check: NoCheck # TensorIterator - variants: method - dispatch: - CPU: legacy::cpu::_th_renorm_ - CUDA: legacy::cuda::_th_renorm_ - - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU: lerp_cpu_scalar_ @@ -5704,34 +6092,10 @@ variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ -- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - device_check: NoCheck # TensorIterator - variants: method - dispatch: - CPU, CUDA: fmod_ - -- func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - device_check: NoCheck # TensorIterator - variants: method - dispatch: - CPU, CUDA: fmod_ - -- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - device_check: NoCheck # TensorIterator - variants: method - dispatch: - CPU, CUDA: remainder_ - -- func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - device_check: NoCheck # TensorIterator - variants: method - dispatch: - CPU, CUDA: remainder_ - - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) variants: method dispatch: CPU, CUDA: addbmm_ @@ -5742,16 +6106,10 @@ - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: method, function dispatch: CPU, CUDA: addbmm -- func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) - device_check: NoCheck # TensorIterator - variants: method - dispatch: - CompositeExplicitAutograd: addcdiv_ - - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: random_ @@ -5868,42 +6226,48 @@ variants: function device_check: NoCheck device_guard: False - func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ne_out + CPU, CUDA: ne_Scalar_out QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: ne.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ne_out + CPU, CUDA: ne_Tensor_out QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: ne.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: ne.Scalar_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: ne_ - func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: ne.Tensor_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: ne_ @@ -5923,68 +6287,78 @@ - func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: eq_out + CPU, CUDA: eq_Scalar_out QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: eq.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: eq QuantizedCPU: eq_quantized_cpu - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: eq_out + CPU, CUDA: eq_Tensor_out QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: eq.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: eq QuantizedCPU: eq_quantized_cpu - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ge_out + CPU, CUDA: ge_Scalar_out QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: ge.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ge_out + CPU, CUDA: ge_Tensor_out QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: ge.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ structured_delegate: ge.Scalar_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: ge_ - func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: ge.Tensor_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: ge_ @@ -6004,42 +6378,48 @@ - func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: le_out + CPU, CUDA: le_Scalar_out QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: le.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: le_out + CPU, CUDA: le_Tensor_out QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: le.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: le.Scalar_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: le_ - func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: le.Tensor_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: le_ @@ -6059,42 +6439,48 @@ - func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: gt_out + CPU, CUDA: gt_Scalar_out QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: gt.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: gt_out + CPU, CUDA: gt_Tensor_out QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: gt.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: gt.Scalar_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: gt_ - func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: gt.Tensor_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: gt_ @@ -6114,42 +6500,48 @@ - func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: lt_out + CPU, CUDA: lt_Scalar_out QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor + structured_delegate: lt.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: lt_out + CPU, CUDA: lt_Tensor_out QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor + structured_delegate: lt.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + structured_delegate: lt.Scalar_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: lt_ - func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + structured_delegate: lt.Tensor_out device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: lt_ @@ -6184,18 +6576,18 @@ - func: take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor variants: method, function - func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: index_select_out_cpu_ - CUDA: index_select_out_cuda + CPU, QuantizedCPU: index_select_out_cpu_ + CUDA, QuantizedCUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor variants: method, function dispatch: - CPU: index_select_cpu_ - CUDA: index_select_cuda + CPU, QuantizedCPU: index_select_cpu_ + CUDA, QuantizedCUDA: index_select_cuda SparseCPU: index_select_sparse SparseCUDA: index_select_sparse - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) @@ -6223,31 +6615,30 @@ device_check: NoCheck device_guard: False - func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: legacy::cpu::_th_nonzero_out + CPU: nonzero_out_cpu CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor variants: method, function dispatch: - CPU: legacy::cpu::_th_nonzero + CPU: nonzero_cpu CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] variants: method, function - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: - CPU: gather_out_cpu_cuda - CUDA: gather_out_cpu_cuda + CPU, CUDA: gather_out - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function - dispatch: - CPU, CUDA: gather + structured_delegate: gather.out - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor variants: function device_check: NoCheck device_guard: False @@ -6258,50 +6649,56 @@ variants: method, function - func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor + structured_delegate: addcmul.out device_check: NoCheck # TensorIterator variants: method, function - dispatch: - CompositeExplicitAutograd: addcmul - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + structured_delegate: addcmul.out device_check: NoCheck # TensorIterator variants: method - dispatch: - CompositeExplicitAutograd: addcmul_ - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor + structured_delegate: addcdiv.out device_check: NoCheck # TensorIterator variants: method, function - dispatch: - CompositeExplicitAutograd: addcdiv -- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor +- func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + structured_delegate: addcdiv.out + device_check: NoCheck # TensorIterator + variants: method + +- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, float label_smoothing=0.0) -> Tensor python_module: nn - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) dispatch: - CPU: legacy::cpu::_th_gels_out - CUDA: legacy::cuda::_th_gels_out + CPU: legacy_lstsq_out + CUDA: legacy_lstsq_out_cuda - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) variants: method, function dispatch: - CPU: legacy::cpu::_th_gels - CUDA: legacy::cuda::_th_gels + CPU: legacy_lstsq + CUDA: legacy_lstsq_cuda - func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient) dispatch: CPU, CUDA: triangular_solve_out @@ -6442,23 +6839,23 @@ - func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor variants: method, function dispatch: CPU, CUDA: ormqr -- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor, Tensor, Tensor) +- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info) variants: function dispatch: CPU, CUDA: _lu_with_info - func: lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CompositeExplicitAutograd: lu_solve_out + CPU, CUDA: lu_solve_out - func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor variants: method, function dispatch: - CompositeExplicitAutograd: lu_solve + CPU, CUDA: lu_solve - func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) variants: function dispatch: CPU, CUDA: lu_unpack @@ -6577,12 +6974,15 @@ dispatch: CPU, CUDA: sign_out - func: signbit(Tensor self) -> Tensor variants: function, method + structured_delegate: signbit.out - func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase dispatch: CPU: signbit_out CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor @@ -6634,41 +7034,72 @@ CPU: lerp_cpu_tensor CUDA: lerp_cuda_tensor - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: legacy::cpu::_th_histc_out + CPU: histogram_histc_cpu_out CUDA: _histc_out_cuda - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor variants: method, function dispatch: - CPU: legacy::cpu::_th_histc + CPU: histogram_histc_cpu CUDA: _histc_cuda +- func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) + dispatch: + CPU: histogram_out_cpu + +- func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) + variants: method, function + dispatch: + CPU: histogram_cpu + +- func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges) + dispatch: + CPU: histogram_out_cpu + +- func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges) + variants: method, function + dispatch: + CPU: histogram_cpu + - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: fmod_out + CompositeExplicitAutograd: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: fmod + CompositeExplicitAutograd: fmod +- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + dispatch: + CompositeExplicitAutograd: fmod_ + - func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: fmod.Tensor_out variants: method, function - dispatch: - CPU, CUDA: fmod + +- func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: fmod.Tensor_out + - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: hypot_out @@ -6726,28 +7157,43 @@ variants: method dispatch: CompositeExplicitAutograd: nextafter_ - func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) - device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: remainder_out + CompositeExplicitAutograd: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor - device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: remainder + CompositeExplicitAutograd: remainder +- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + variants: method + dispatch: + CompositeExplicitAutograd: remainder_ + - func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: remainder.Tensor_out variants: method, function + +- func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured_delegate: remainder.Tensor_out + variants: method + +- func: remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function dispatch: CPU, CUDA: remainder - func: min(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -6755,15 +7201,18 @@ dispatch: CPU, CUDA: min QuantizedCPU: min_quantized_cpu - func: fmin(Tensor self, Tensor other) -> Tensor + structured_delegate: fmin.out + device_check: NoCheck # TensorIterator variants: method, function - dispatch: - CPU, CUDA: fmin - func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: fmin_out - func: max(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -6771,15 +7220,18 @@ dispatch: CPU, CUDA: max QuantizedCPU: max_quantized_cpu - func: fmax(Tensor self, Tensor other) -> Tensor + structured_delegate: fmax.out + device_check: NoCheck # TensorIterator variants: method, function - dispatch: - CPU, CUDA: fmax - func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: fmax_out - func: maximum(Tensor self, Tensor other) -> Tensor structured_delegate: maximum.out @@ -6926,34 +7378,48 @@ dispatch: QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: all.all_out variants: method, function + +- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True dispatch: - CPU, CUDA: all + CPU, CUDA: all_all_out - func: any(Tensor self) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: any.all_out variants: method, function dispatch: - CPU, CUDA: any SparseCPU, SparseCUDA: any_sparse +- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck + structured: True + dispatch: + CPU, CUDA: any_all_out + - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True dispatch: - CPU: legacy::cpu::_th_renorm_out - CUDA: legacy::cuda::_th_renorm_out + CPU, CUDA: renorm_out - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor device_check: NoCheck # TensorIterator variants: method, function - dispatch: - CPU: legacy::cpu::_th_renorm - CUDA: legacy::cuda::_th_renorm + structured_delegate: renorm.out +- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: method + structured_delegate: renorm.out + - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) variants: method device_check: NoCheck device_guard: False dispatch: @@ -7082,30 +7548,10 @@ - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) 
dispatch: CPU: _index_copy_impl_ CUDA: _index_copy_impl_ -- func: _cumsum(Tensor self, int dim) -> Tensor - dispatch: - CPU: _cumsum_cpu - CUDA: _cumsum_cuda - -- func: _cumsum.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: _cumsum_out_cpu - CUDA: _cumsum_out_cuda - -- func: _cumprod(Tensor self, int dim) -> Tensor - dispatch: - CPU: _cumprod_cpu - CUDA: _cumprod_cuda - -- func: _cumprod.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: _cumprod_out_cpu - CUDA: _cumprod_out_cuda - - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ @@ -7791,10 +8237,19 @@ - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False) -> Tensor dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda +- func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor + structured_delegate: _convert_indices_from_coo_to_csr.out + +- func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: _convert_indices_from_coo_to_csr_structured_cpu + CUDA: _convert_indices_from_coo_to_csr_structured_cuda + ## NN wrappers - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn @@ -7839,29 +8294,29 @@ - func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: multi_margin_loss_cpu_out - CUDA: legacy::cuda::_thnn_multi_margin_loss_forward_out + CUDA: multi_margin_loss_cuda_out - func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn dispatch: CPU: multi_margin_loss_cpu - CUDA: legacy::cuda::_thnn_multi_margin_loss_forward + CUDA: multi_margin_loss_cuda - func: multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: multi_margin_loss_cpu_backward_out - CUDA: legacy::cuda::_thnn_multi_margin_loss_backward_out + CUDA: multi_margin_loss_cuda_backward_out - func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn dispatch: CPU: multi_margin_loss_cpu_backward - CUDA: legacy::cuda::_thnn_multi_margin_loss_backward + CUDA: multi_margin_loss_cuda_backward - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor @@ -7869,29 +8324,29 @@ - func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) 
is_target) -> (Tensor(a!), Tensor(b!)) python_module: nn dispatch: CPU: multilabel_margin_loss_forward_out_cpu - CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward_out + CUDA: multilabel_margin_loss_forward_out_cuda - func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) python_module: nn dispatch: CPU: multilabel_margin_loss_forward_cpu - CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward + CUDA: multilabel_margin_loss_forward_cuda - func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: multilabel_margin_loss_backward_cpu_out - CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out + CUDA: multilabel_margin_loss_backward_cuda_out - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor python_module: nn dispatch: CPU: multilabel_margin_loss_backward_cpu - CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward + CUDA: multilabel_margin_loss_backward_cuda - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - func: nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor @@ -7900,31 +8355,29 @@ - func: nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor python_module: nn - func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) python_module: nn + structured: True dispatch: CPU: nll_loss_forward_out_cpu - CUDA: legacy::cuda::_thnn_nll_loss_forward_out + CUDA: nll_loss_forward_out_cuda - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn - dispatch: - CPU: nll_loss_forward_cpu - CUDA: legacy::cuda::_thnn_nll_loss_forward + structured_delegate: nll_loss_forward.output - func: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True dispatch: CPU: nll_loss_backward_out_cpu - CUDA: legacy::cuda::_thnn_nll_loss_backward_out + CUDA: nll_loss_backward_out_cuda - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn - dispatch: - CPU: nll_loss_backward_cpu - CUDA: legacy::cuda::_thnn_nll_loss_backward + structured_delegate: nll_loss_backward.grad_input - func: nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor @@ -7932,29 +8385,29 @@ - func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) 
total_weight) -> (Tensor(a!), Tensor(b!)) python_module: nn dispatch: CPU: nll_loss2d_forward_out_cpu - CUDA: legacy::cuda::_thnn_nll_loss2d_forward_out + CUDA: nll_loss2d_forward_out_cuda - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn dispatch: CPU: nll_loss2d_forward_cpu - CUDA: legacy::cuda::_thnn_nll_loss2d_forward + CUDA: nll_loss2d_forward_cuda - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: nll_loss2d_backward_out_cpu - CUDA: legacy::cuda::_thnn_nll_loss2d_backward_out + CUDA: nll_loss2d_backward_out_cuda - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn dispatch: CPU: nll_loss2d_backward_cpu - CUDA: legacy::cuda::_thnn_nll_loss2d_backward + CUDA: nll_loss2d_backward_cuda - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: @@ -8029,45 +8482,51 @@ - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor structured_delegate: elu.out device_check: NoCheck # TensorIterator python_module: nn -- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor +- func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: elu_backward + CPU, CUDA: elu_backward_out +- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor + structured_delegate: elu_backward.grad_input + python_module: nn + - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) structured_delegate: elu.out device_check: NoCheck # TensorIterator python_module: nn dispatch: CompositeExplicitAutograd: elu_ - func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU: glu_out - CUDA: legacy::cuda::_thnn_glu_forward_out + CPU, CUDA: glu_out - func: glu(Tensor self, int dim=-1) -> Tensor + structured_delegate: glu.out + device_check: NoCheck # TensorIterator python_module: nn - dispatch: - CPU: glu - CUDA: legacy::cuda::_thnn_glu_forward - func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: glu_backward_out - CUDA: legacy::cuda::_thnn_glu_backward_out + CPU: glu_backward_cpu_out + CUDA: glu_backward_cuda_out - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor python_module: nn dispatch: - CPU: glu_backward - CUDA: legacy::cuda::_thnn_glu_backward + CPU: glu_backward_cpu + CUDA: glu_backward_cuda - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator @@ -8085,15 +8544,21 @@ - func: hardsigmoid_(Tensor(a!) 
self) -> Tensor(a!) structured_delegate: hardsigmoid.out device_check: NoCheck # TensorIterator python_module: nn -- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor +- func: hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: hardsigmoid_backward + CPU, CUDA: hardsigmoid_backward_out +- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor + structured_delegate: hardsigmoid_backward.grad_input + python_module: nn + - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU, CUDA: hardtanh_out @@ -8160,15 +8625,21 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: QuantizedCPU: leaky_relu_quantized_cpu -- func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor +- func: leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: leaky_relu_backward + CPU, CUDA: leaky_relu_backward_out +- func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor + structured_delegate: leaky_relu_backward.grad_input + python_module: nn + - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) structured_delegate: leaky_relu.out device_check: NoCheck # TensorIterator python_module: nn dispatch: @@ -8185,53 +8656,53 @@ - func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU: log_sigmoid_forward_out_cpu - CUDA: legacy::cuda::_thnn_log_sigmoid_forward_out + CUDA: log_sigmoid_forward_out_cuda - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU: log_sigmoid_forward_cpu - CUDA: legacy::cuda::_thnn_log_sigmoid_forward + CUDA: log_sigmoid_forward_cuda - func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: log_sigmoid_backward_out_cpu - CUDA: legacy::cuda::_thnn_log_sigmoid_backward_out + CPU: log_sigmoid_backward_cpu_out + CUDA: log_sigmoid_backward_cuda_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor python_module: nn dispatch: CPU: log_sigmoid_backward_cpu - CUDA: legacy::cuda::_thnn_log_sigmoid_backward + CUDA: log_sigmoid_backward_cuda - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: rrelu_with_noise_out_cpu - CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_out + CUDA: rrelu_with_noise_out_cuda - func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? 
generator=None) -> Tensor python_module: nn dispatch: CPU: rrelu_with_noise_cpu - CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward + CUDA: rrelu_with_noise_cuda - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor python_module: nn dispatch: CompositeExplicitAutograd: rrelu_with_noise_backward - func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) python_module: nn dispatch: CPU: rrelu_with_noise_cpu_ - CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_ + CUDA: rrelu_with_noise_cuda_ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator @@ -8243,18 +8714,19 @@ structured_delegate: softplus.out device_check: NoCheck # TensorIterator python_module: nn - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor + structured_delegate: softplus_backward.grad_input python_module: nn - dispatch: - CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator @@ -8266,23 +8738,25 @@ structured_delegate: softshrink.out device_check: NoCheck # TensorIterator python_module: nn - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor + structured_delegate: softshrink_backward.grad_input python_module: nn - dispatch: - CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: - CPU, CUDA: adaptive_avg_pool2d_out_cpu + CPU: adaptive_avg_pool2d_out_cpu + CUDA: adaptive_avg_pool2d_out_cuda MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor python_module: nn @@ -8382,64 +8856,68 @@ python_module: nn structured_delegate: adaptive_max_pool3d_backward.grad_input - func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + structured: True + precomputed: + - kernel_size -> int kH, int kW + - stride -> int dH, int dW + - padding -> int padH, int padW dispatch: CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor python_module: nn + structured_delegate: avg_pool2d.out dispatch: - CPU: avg_pool2d_cpu - CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d QuantizedCPU: avg_pool2d_quantized_cpu - func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True dispatch: CPU: avg_pool2d_backward_out_cpu CUDA: avg_pool2d_backward_out_cuda MkldnnCPU: mkldnn_avg_pool2d_backward_out - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor python_module: nn + structured_delegate: avg_pool2d_backward.grad_input dispatch: - CPU: avg_pool2d_backward_cpu - CUDA: avg_pool2d_backward_cuda MkldnnCPU: mkldnn_avg_pool2d_backward - func: avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + structured: True dispatch: CPU: avg_pool3d_out_cpu CUDA: avg_pool3d_out_cuda MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn + structured_delegate: avg_pool3d.out dispatch: - CPU: avg_pool3d_cpu - CUDA: avg_pool3d_cuda MkldnnCPU: mkldnn_avg_pool3d QuantizedCPU: avg_pool3d_quantized_cpu - func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True dispatch: CPU: avg_pool3d_backward_out_cpu CUDA: avg_pool3d_backward_out_cuda MkldnnCPU: mkldnn_avg_pool3d_backward_out - func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor python_module: nn + structured_delegate: avg_pool3d_backward.grad_input dispatch: - CPU: avg_pool3d_backward_cpu - CUDA: avg_pool3d_backward_cuda MkldnnCPU: mkldnn_avg_pool3d_backward # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) python_module: nn @@ -8602,19 +9080,18 @@ dispatch: QuantizedCPU: reflection_pad1d_cpu - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True dispatch: CPU: reflection_pad1d_backward_out_cpu CUDA: reflection_pad1d_backward_out_cuda - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn - dispatch: - CPU: reflection_pad1d_backward_cpu - CUDA: reflection_pad1d_backward_cuda + structured_delegate: reflection_pad1d_backward.grad_input - func: reflection_pad2d.out(Tensor self, int[4] padding, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad2d_out_cpu @@ -8636,10 +9113,32 @@ python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu CUDA: reflection_pad2d_backward_cuda +- func: reflection_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad3d_out_cpu + CUDA: reflection_pad3d_out_cuda + +- func: reflection_pad3d(Tensor self, int[6] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad3d.out + +- func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, int[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + structured: True + dispatch: + CPU: reflection_pad3d_backward_out_cpu + CUDA: reflection_pad3d_backward_out_cuda + +- func: reflection_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor + python_module: nn + structured_delegate: reflection_pad3d_backward.grad_input + - func: replication_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: replication_pad1d_out_cpu @@ -8940,37 +9439,40 @@ python_module: nn structured_delegate: upsample_nearest3d_backward.grad_input - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn - dispatch: - CPU, CUDA: sigmoid_backward + structured_delegate: sigmoid_backward.grad_input - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logit_backward_out - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor python_module: nn - dispatch: - CPU, CUDA: logit_backward + structured_delegate: logit_backward.grad_input - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn - dispatch: - CPU, CUDA: tanh_backward + structured_delegate: tanh_backward.grad_input # What's a thnn_conv_ versus a slow_conv_? # # Historically, we have inefficient implementations of convolutions # coming from the THNN/THCUNN library. These convolutions typically @@ -8988,19 +9490,18 @@ # these are the same thing, but we give them different prefixes to # make the operational distinction clear. - func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + structured: True dispatch: - CPU: slow_conv_transpose2d_out_cpu - CUDA: slow_conv_transpose2d_out_cuda + CPU: slow_conv_transpose2d_structured_cpu + CUDA: slow_conv_transpose2d_structured_cuda - func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor python_module: nn - dispatch: - CPU: slow_conv_transpose2d_cpu - CUDA: slow_conv_transpose2d_cuda + structured_delegate: slow_conv_transpose2d.out - func: slow_conv_transpose2d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn dispatch: CPU: slow_conv_transpose2d_backward_out_cpu @@ -9044,17 +9545,17 @@ - func: thnn_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn dispatch: CPU: slow_conv2d_forward_out_cpu - CUDA: legacy::cuda::_thnn_conv2d_forward_out + CUDA: slow_conv2d_forward_out_cuda - func: thnn_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input) python_module: nn dispatch: CPU: slow_conv2d_forward_cpu - CUDA: legacy::cuda::_thnn_conv2d_forward + CUDA: slow_conv2d_forward_cuda - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn dispatch: CPU: slow_conv2d_backward_out_cpu @@ -9064,35 +9565,30 @@ python_module: nn dispatch: CPU: slow_conv2d_backward_cpu CUDA: slow_conv2d_backward_cuda -- func: thnn_conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True python_module: nn - -- func: thnn_conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor - python_module: nn - -- func: thnn_conv_depthwise2d_forward.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) - python_module: nn dispatch: - CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward_out + CUDA: conv_depthwise2d_cuda_out -- func: thnn_conv_depthwise2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor +- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor python_module: nn dispatch: - CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward + CUDA: conv_depthwise2d_cuda -- func: thnn_conv_depthwise2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) grad_input, Tensor(b!) 
grad_weight) -> (Tensor(a!), Tensor(b!)) +- func: _conv_depthwise2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) grad_input, Tensor(b!) grad_weight) -> (Tensor(a!), Tensor(b!)) python_module: nn dispatch: - CUDA: thnn_conv_depthwise2d_backward_out + CUDA: conv_depthwise2d_backward_cuda_out -- func: thnn_conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight) +- func: _conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight) python_module: nn dispatch: - CUDA: thnn_conv_depthwise2d_backward + CUDA: conv_depthwise2d_backward_cuda - func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise3d_cuda @@ -9224,19 +9720,25 @@ dispatch: CUDA: record_stream_cuda - func: isposinf(Tensor self) -> Tensor variants: function, method + structured_delegate: isposinf.out - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: isposinf_out - func: isneginf(Tensor self) -> Tensor variants: function, method + structured_delegate: isneginf.out - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: isneginf_out # NOTE [_add_batch_dim and _remove_batch_dim] # _add_batch_dim and _remove_batch_dim are meant to be used in the implementation @@ -9267,10 +9769,23 @@ python_module: special variants: function dispatch: CPU, CUDA: special_entr_out +- func: special_ndtri(Tensor self) -> Tensor + structured_delegate: special_ndtri.out + python_module: special + variants: function + +- func: special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_ndtri_out + - func: special_expm1(Tensor self) -> Tensor python_module: special variants: function - func: special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9283,10 +9798,26 @@ - func: special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: special variants: function +- func: special_psi(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_digamma(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + - func: special_gammaln(Tensor self) -> Tensor python_module: special variants: function - func: special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9306,17 +9837,37 @@ variants: function - func: special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: special +- func: special_erfcx(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_erfcx.out + +- func: special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_erfcx_out + - func: special_erfinv(Tensor self) -> Tensor python_module: special variants: function - func: special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: special +- func: special_ndtr(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + - func: special_xlog1py(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator python_module: special variants: function structured_delegate: special_xlog1py.out @@ -9356,10 +9907,93 @@ python_module: special variants: function dispatch: CompositeExplicitAutograd: special_xlog1py_out +- func: special_xlogy(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + +- func: special_zeta(Tensor self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + structured_delegate: special_zeta.out + dispatch: + CompositeExplicitAutograd: special_zeta + +- func: special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta + +- func: special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta + +- func: special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_zeta_out + +- func: special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta_out + +- func: special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+ device_check: NoCheck # TensorIterator + python_module: special + variants: function + dispatch: + CompositeExplicitAutograd: special_zeta_out + +- func: special_i0(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + - func: special_i0e(Tensor self) -> Tensor python_module: special variants: function structured_delegate: special_i0e.out @@ -9368,25 +10002,115 @@ structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: special_i0e_out +- func: special_i1(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i1.out + +- func: special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i1_out + +- func: special_i1e(Tensor self) -> Tensor + python_module: special + variants: function + structured_delegate: special_i1e.out + +- func: special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + structured: True + structured_inherits: TensorIteratorBase + dispatch: + CPU, CUDA: special_i1e_out + - func: special_logit(Tensor self, float? eps=None) -> Tensor python_module: special variants: function - func: special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) python_module: special +- func: special_polygamma(int n, Tensor self) -> Tensor + python_module: special + variants: function, method + +- func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + +- func: special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor + python_module: special + variants: function + +- func: special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + - func: special_expit(Tensor self) -> Tensor python_module: special variants: function - func: special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: special variants: function +- func: special_sinc(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_round(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_log1p(Tensor self) -> Tensor + python_module: special + variants: function + +- func: special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + python_module: special + variants: function + +- func: special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + +- func: special_gammainc(Tensor self, Tensor other) -> Tensor + python_module: special + variants: function + +- func: special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: special + variants: function + +- func: special_gammaincc(Tensor self, Tensor other) -> Tensor + python_module: special + variants: function + +- func: special_multigammaln(Tensor self, int p) -> Tensor + python_module: special + variants: function + +- func: special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!) + python_module: special + variants: function + ## Functions related to the fast Fourier transform and the torch.fft namespace # Note [FFT namespace binding] # Functions in the fft python module should have their names start with # "fft_" underscore and be bound to the desired Python name in # torch/fft/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/fft.h. @@ -9540,45 +10264,51 @@ # The "linalg_" names should be hidden from the user and not documented. # # See linalg_det as an example. # "_ex" stands for experimental -- func: linalg_cholesky_ex(Tensor self, *, bool check_errors=False) -> (Tensor L, Tensor info) +- func: linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info) python_module: linalg variants: function dispatch: CPU, CUDA: linalg_cholesky_ex -- func: linalg_cholesky_ex.L(Tensor self, *, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info) +- func: linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info) python_module: linalg variants: function dispatch: CPU, CUDA: linalg_cholesky_ex_out -- func: linalg_cholesky(Tensor self) -> Tensor +- func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor python_module: linalg variants: function -- func: linalg_cholesky.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +- func: linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!) python_module: linalg variants: function - func: linalg_det(Tensor self) -> Tensor python_module: linalg variants: function - dispatch: - CompositeExplicitAutograd: linalg_det - func: linalg_det.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: linalg - dispatch: - CompositeExplicitAutograd: linalg_det_out # torch.det, alias for torch.linalg.det - func: det(Tensor self) -> Tensor variants: function, method +- func: _det_lu_based_helper(Tensor self) -> (Tensor det, Tensor lu, Tensor pivs) + variants: function + dispatch: + CPU, CUDA: _det_lu_based_helper + +- func: _det_lu_based_helper_backward_helper(Tensor det_grad, Tensor det, Tensor self, Tensor lu, Tensor pivs) -> Tensor + variants: function + dispatch: + CPU, CUDA: _det_lu_based_helper_backward_helper + - func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values) python_module: linalg variants: function dispatch: CompositeExplicitAutograd: linalg_lstsq @@ -9587,10 +10317,18 @@ python_module: linalg variants: function dispatch: CPU, CUDA: linalg_lstsq_out +# torch.linalg.matmul, alias for torch.matmul +- func: linalg_matmul(Tensor self, Tensor other) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ python_module: linalg + - func: linalg_slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) python_module: linalg variants: function dispatch: CPU, CUDA: linalg_slogdet @@ -9619,23 +10357,25 @@ - func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) python_module: linalg variants: function dispatch: - CompositeExplicitAutograd: linalg_eigh + CPU, CUDA: linalg_eigh - func: linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) python_module: linalg dispatch: - CompositeExplicitAutograd: linalg_eigh_out + CPU, CUDA: linalg_eigh_out - func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor python_module: linalg variants: function - func: linalg_eigvalsh.out(Tensor self, str UPLO='L', *, Tensor(a!) out) -> Tensor(a!) python_module: linalg + dispatch: + CPU, CUDA: linalg_eigvalsh_out - func: linalg_householder_product(Tensor input, Tensor tau) -> Tensor python_module: linalg variants: function dispatch: @@ -9675,24 +10415,20 @@ - func: inner(Tensor self, Tensor other) -> Tensor variants: function, method - func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -# torch.outer, alias for torch.ger - func: outer(Tensor self, Tensor vec2) -> Tensor variants: function, method - func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) +# torch.ger, alias for torch.outer - func: ger(Tensor self, Tensor vec2) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: ger - func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CompositeExplicitAutograd: ger_out - func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor python_module: linalg variants: function @@ -9776,26 +10512,20 @@ - func: linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) python_module: linalg variants: function -- func: _linalg_solve_out_helper_(Tensor(a!) self, Tensor(b!) other, Tensor(c!) infos) -> Tensor(a!) - variants: function - dispatch: - CPU: _linalg_solve_out_helper_cpu - CUDA: _linalg_solve_out_helper_cuda - - func: linalg_solve(Tensor input, Tensor other) -> Tensor python_module: linalg variants: function dispatch: - CompositeExplicitAutograd: linalg_solve + CPU, CUDA: linalg_solve - func: linalg_solve.out(Tensor input, Tensor other, *, Tensor(a!) out) -> Tensor(a!) python_module: linalg dispatch: - CompositeExplicitAutograd: linalg_solve_out + CPU, CUDA: linalg_solve_out - func: linalg_tensorinv(Tensor self, int ind=2) -> Tensor python_module: linalg variants: function @@ -9895,13 +10625,13 @@ - func: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor variants: function dispatch: CPU, CUDA: segment_reduce_kernel -- func: segment_reduce_backward(Tensor grad, Tensor output, Tensor data, *, Tensor? lengths=None) -> Tensor +- func: _segment_reduce_backward(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, int axis=0) -> Tensor variants: function dispatch: - CPU, CUDA: segment_reduce_backward_kernel + CPU, CUDA: _segment_reduce_backward_kernel - func: pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor python_module: nn variants: function
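
# Note [Structured-kernel pattern in this release]
# Many of the entries changed above (isposinf/isneginf and most of the new torch.special
# ops such as special_erfcx, special_i1, special_zeta) follow the same shape: the functional
# variant declares a `structured_delegate` pointing at its `.out` overload, and the `.out`
# overload is marked `structured: True` (usually inheriting from TensorIteratorBase), so only
# the `.out` kernel carries per-backend `dispatch` entries. Sketch of that pattern only;
# `my_op` is a placeholder for illustration, not an operator defined in this file.
- func: my_op(Tensor self) -> Tensor
  variants: function, method
  structured_delegate: my_op.out

- func: my_op.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA: my_op_out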