codegen/native_functions.yaml in torch-rb-0.14.1 vs codegen/native_functions.yaml in torch-rb-0.15.0

- old
+ new

@@ -183,11 +183,11 @@ - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> () dispatch: CompositeExplicitAutograd: sym_constrain_range -- func: sym_constrain_range_for_size(Scalar size, *, int? min, int? max) -> () +- func: sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> () dispatch: CompositeExplicitAutograd: sym_constrain_range_for_size - func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor dispatch: @@ -429,10 +429,11 @@ - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sgn_out + MPS: sgn_out_mps SparseCPU, SparseCUDA: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out tags: pointwise - func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor @@ -679,19 +680,33 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: all.out variants: function, method +- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: all.dims_out + variants: function, method + cpp_no_default_args: ['dim'] + dispatch: + CompositeExplicitAutograd: all_dims_default + - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True - precomputed: - - dim -> int dim dispatch: CPU, CUDA: all_out MPS: all_out_mps +- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: all_dims_out + CompositeExplicitAutograd: all_dims_out_default + cpp_no_default_args: ['dim'] + - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -707,19 +722,34 @@ device_check: NoCheck # TensorIterator structured_delegate: any.out variants: function, method tags: core +- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor + device_check: NoCheck # TensorIterator + structured_delegate: any.dims_out + variants: function, method + cpp_no_default_args: ['dim'] + tags: core + dispatch: + CompositeExplicitAutograd: any_dims_default + - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True - precomputed: - - dim -> int dim dispatch: CPU, CUDA: any_out MPS: any_out_mps +- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + dispatch: + CPU, CUDA: any_dims_out + CompositeExplicitAutograd: any_dims_out_default + cpp_no_default_args: ['dim'] + - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
@@ -1324,10 +1354,11 @@ - func: cat(Tensor[] tensors, int dim=0) -> Tensor structured_delegate: cat.out dispatch: SparseCPU, SparseCUDA: cat_sparse QuantizedCPU: cat_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: cat_nested tags: core - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) structured: True precomputed: @@ -1611,88 +1642,97 @@ - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) variants: method manual_cpp_binding: True -- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor +- func: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution autogen: convolution.out tags: core -- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CompositeExplicitAutograd, CUDA: convolution_backward autogen: convolution_backward.out tags: core -- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor +- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution_overrideable autogen: convolution_overrideable.out -- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) +- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) dispatch: CompositeExplicitAutograd: convolution_backward_overrideable autogen: convolution_backward_overrideable.out -- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor dispatch: CompositeExplicitAutograd: _convolution autogen: _convolution.out -- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor -- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor +- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CompositeImplicitAutograd: _convolution_mode_symint -- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) -- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv1d_symint -- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv2d_symint -- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv3d_symint -- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv1d_padding_symint -- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding="valid", int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv2d_padding_symint -- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, str padding="valid", int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d.padding(Tensor input, Tensor weight, Tensor? 
bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv3d_padding_symint - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor dispatch: CompositeExplicitAutograd: conv_tbc autogen: conv_tbc.out - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) # NB: we inherit the goofy argument order from PyTorch torch.nn.functional -- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose1d_symint -- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor +- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose2d_symint -- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose3d_symint - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor variants: function dispatch: CompositeExplicitAutogradNonFunctional: copy + tags: core - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False @@ -1718,10 +1758,12 @@ - func: cos(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method structured_delegate: cos.out + dispatch: + NestedTensorCPU, NestedTensorCUDA: cos_nested tags: [core, pointwise] - func: cos_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method @@ -1800,36 +1842,36 @@ - func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor) dispatch: CUDA: cudnn_batch_norm_backward autogen: cudnn_batch_norm_backward.out -- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution autogen: cudnn_convolution.out -- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution_transpose autogen: cudnn_convolution_transpose.out -- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: MPS: _mps_convolution_transpose autogen: _mps_convolution_transpose.out -- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor) dispatch: MPS: mps_convolution_transpose_backward autogen: mps_convolution_transpose_backward.out -- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: cudnn_convolution_relu autogen: cudnn_convolution_relu.out -- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: cudnn_convolution_add_relu autogen: cudnn_convolution_add_relu.out # NB: input is special cased in a way I don't quite understand @@ -1965,10 +2007,11 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) dispatch: CPU: ctc_loss_cpu CUDA: ctc_loss_gpu + Meta: ctc_loss_meta autogen: _ctc_loss.out tags: dynamic_output_shape # the shape of second output is data dependent - func: _ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) dispatch: @@ -1997,10 +2040,11 @@ - func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) variants: function, method dispatch: CompositeExplicitAutograd: diagonal + tags: core - func: linalg_diagonal(Tensor(a) A, *, int offset=0, int dim1=-2, int dim2=-1) -> Tensor(a) python_module: linalg variants: function @@ -2077,11 +2121,11 @@ device_check: NoCheck # TensorIterator variants: function, method structured_delegate: div.out_mode dispatch: SparseCPU, SparseCUDA: div_sparse - tags: pointwise + tags: [core, pointwise] - func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method structured_delegate: div.out_mode @@ -2118,11 +2162,11 @@ - func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: div - tags: pointwise + tags: [core, pointwise] - func: div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!) variants: method dispatch: CompositeExplicitAutograd: div_ @@ -2368,11 +2412,11 @@ - func: resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: method device_check: NoCheck device_guard: False - tags: inplace_view + tags: [core, inplace_view] dispatch: Meta: resize__symint CPU: resize_ CUDA: resize_cuda_ MPS: resize_mps_ @@ -2515,11 +2559,11 @@ structured_delegate: expm1.out variants: function, method dispatch: SparseCPU, SparseCUDA: expm1_sparse SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr - tags: pointwise + tags: [core, pointwise] - func: expm1_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: expm1.out variants: function, method @@ -2682,14 +2726,19 @@ SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method + dispatch: + CompositeExplicitAutograd: floor_divide - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method + dispatch: + CompositeExplicitAutograd: floor_divide_ + autogen: floor_divide.Scalar_out - func: frac(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: frac.out variants: function, method @@ -2977,11 +3026,11 @@ # Used by inductor to signal indexing without bounds checks # Note that we don't support boolean indexing, to avoid dynamic output shapes - func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function dispatch: - CPU, CUDA: _unsafe_index + CompositeExplicitAutograd: _unsafe_index - func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) structured: True variants: function precomputed: @@ -3251,18 +3300,22 @@ - func: _cslt_compress(Tensor input) -> Tensor dispatch: CUDA: _cslt_compress -- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor +- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> Tensor dispatch: CUDA: _cslt_sparse_mm - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor dispatch: CUDA: _sparse_semi_structured_linear +- func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor + dispatch: + CUDA: _mixed_dtypes_linear + - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) @@ -3289,16 +3342,46 @@ - func: linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CompositeExplicitAutograd: linspace +- func: linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace + +- func: linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace + - func: linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: linspace_out CUDA: linspace_cuda_out MPS: linspace_out_mps +- func: linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace_out + +- func: linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!) 
+ category_override: factory + dispatch: + CompositeExplicitAutograd: linspace_out + +- func: linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: linspace_out + - func: log(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log.out variants: function, method tags: [core, pointwise] @@ -3320,11 +3403,11 @@ - func: log10(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log10.out variants: function, method - tags: pointwise + tags: [core, pointwise] - func: log10_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: log10.out variants: function, method @@ -3344,11 +3427,11 @@ structured_delegate: log1p.out variants: function, method dispatch: SparseCPU, SparseCUDA: log1p_sparse SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr - tags: pointwise + tags: [core, pointwise] - func: log1p_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: log1p.out variants: function, method @@ -3370,11 +3453,11 @@ - func: log2(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log2.out variants: function, method - tags: pointwise + tags: [core, pointwise] - func: log2_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: log2.out variants: function, method @@ -3475,15 +3558,45 @@ - func: logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CompositeExplicitAutograd: logspace +- func: logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace + +- func: logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace + - func: logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: logspace_out CUDA: logspace_cuda_out +- func: logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace_out + +- func: logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace_out + +- func: logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + category_override: factory + dispatch: + CompositeExplicitAutograd: logspace_out + # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor variants: function, method - func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) @@ -3845,21 +3958,21 @@ MPS: amin_out_mps # TODO: Add this function to MPS dispatch key so that we avoid declaring it in # native_functions.yaml # https://github.com/pytorch/pytorch/issues/77394 -- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: MPS: _mps_convolution autogen: _mps_convolution.out -- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: MPS: mps_convolution_backward autogen: mps_convolution_backward.out -- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution autogen: mkldnn_convolution.out - func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor) @@ -3881,30 +3994,30 @@ - func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor) dispatch: CUDA: miopen_batch_norm_backward autogen: miopen_batch_norm_backward.out -- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution autogen: miopen_convolution.out -- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution_transpose autogen: miopen_convolution_transpose.out -- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? 
bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_depthwise_convolution autogen: miopen_depthwise_convolution.out -- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: miopen_convolution_relu -- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: miopen_convolution_add_relu - func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: @@ -3941,10 +4054,18 @@ - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) dispatch: CUDA: _int_mm_out_cuda +- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor + dispatch: + CUDA: _convert_weight_to_int4pack_cuda + +- func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor + dispatch: + CUDA: _weight_int4pack_mm_cuda + - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor python_module: sparse - func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor python_module: sparse @@ -4085,10 +4206,11 @@ variants: function, method device_check: NoCheck device_guard: False dispatch: CompositeImplicitAutograd: narrow_symint + NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a) variants: function, method device_check: NoCheck device_guard: False @@ -4197,11 +4319,11 @@ - func: is_vulkan_available() -> bool - func: _nnpack_available() -> bool -- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor +- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? 
bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor variants: function dispatch: CompositeExplicitAutograd: _nnpack_spatial_convolution autogen: _nnpack_spatial_convolution.out @@ -4312,35 +4434,37 @@ variants: function, method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor dispatch: CPU: pixel_shuffle_cpu + MPS: pixel_shuffle_mps CompositeExplicitAutogradNonFunctional: math_pixel_shuffle autogen: pixel_shuffle.out tags: core - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor dispatch: CPU: pixel_unshuffle_cpu + MPS: pixel_unshuffle_mps CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle autogen: pixel_unshuffle.out -- func: channel_shuffle(Tensor self, int groups) -> Tensor +- func: channel_shuffle(Tensor self, SymInt groups) -> Tensor dispatch: CPU, CUDA: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu autogen: channel_shuffle.out -- func: native_channel_shuffle(Tensor self, int groups) -> Tensor +- func: native_channel_shuffle(Tensor self, SymInt groups) -> Tensor dispatch: CPU: channel_shuffle_cpu CompositeImplicitAutograd: math_channel_shuffle - func: is_pinned(Tensor self, Device? device=None) -> bool variants: method dispatch: - CUDA: is_pinned_cuda + NestedTensorCUDA, CUDA: is_pinned_cuda MPS: is_pinned_mps CompositeExplicitAutograd: is_pinned_default # TODO: add a copy kwarg that guarantees that the tensor is put into fresh # pinned memory @@ -4350,10 +4474,11 @@ # Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor - func: _pin_memory(Tensor self, Device? device=None) -> Tensor dispatch: CUDA: _pin_memory_cuda MPS: _pin_memory_mps + NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested autogen: _pin_memory.out - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method @@ -4658,23 +4783,25 @@ CompositeExplicitAutograd: repeat MPS: repeat_mps autogen: repeat.out tags: core -- func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor +- func: repeat_interleave.Tensor(Tensor repeats, *, SymInt? output_size=None) -> Tensor variants: function dispatch: CPU: repeat_interleave_cpu CUDA: repeat_interleave_cuda MPS: repeat_interleave_mps tags: dynamic_output_shape autogen: repeat_interleave.Tensor_out -- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, int? output_size=None) -> Tensor +- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor variants: function, method + dispatch: + CompositeImplicitAutograd: repeat_interleave_symint -- func: repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, int? output_size=None) -> Tensor +- func: repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor variants: function, method dispatch: CompositeImplicitAutograd: repeat_interleave_symint - func: reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a) @@ -4971,39 +5098,44 @@ - func: silu(Tensor self) -> Tensor structured_delegate: silu.out python_module: nn dispatch: NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu + tags: pointwise - func: silu_(Tensor(a!) self) -> Tensor(a!) structured_delegate: silu.out python_module: nn dispatch: NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_ + tags: pointwise - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: silu_out MPS: silu_out_mps + tags: pointwise - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: silu_backward_out MPS: silu_backward_out_mps + tags: pointwise - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor structured_delegate: silu_backward.grad_input python_module: nn dispatch: CompositeImplicitAutograd: math_silu_backward NestedTensorCPU, NestedTensorCUDA: silu_backward_nested + tags: pointwise - func: mish(Tensor self) -> Tensor structured_delegate: mish.out python_module: nn @@ -5015,15 +5147,17 @@ structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: mish_out + MPS: mish_out_mps - func: mish_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn dispatch: CPU, CUDA: mish_backward + MPS: mish_backward_mps CompositeImplicitAutograd: math_mish_backward - func: sigmoid(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: sigmoid.out @@ -5074,10 +5208,11 @@ structured_delegate: sin.out variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr SparseCPU, SparseCUDA: sin_sparse + NestedTensorCPU, NestedTensorCUDA: sin_nested tags: [core, pointwise] - func: sin_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: sin.out @@ -5969,11 +6104,11 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: SparseCPU, SparseCUDA: trunc_sparse SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr - tags: pointwise + tags: [core, pointwise] - func: trunc_(Tensor(a!) self) -> Tensor(a!) structured_delegate: trunc.out device_check: NoCheck # TensorIterator variants: function, method @@ -6194,17 +6329,19 @@ - func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) variants: function dispatch: CPU: weight_norm_cpu CUDA: weight_norm_cuda + MPS: weight_norm_mps autogen: _weight_norm_interface.out - func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function dispatch: CPU: weight_norm_backward_cpu CUDA: weight_norm_backward_cuda + MPS: weight_norm_backward_mps autogen: _weight_norm_interface_backward.out - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function @@ -6217,10 +6354,11 @@ - func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CPU: _efficientzerotensor CUDA: _efficientzerotensor_cuda + MPS: _efficientzerotensor_mps Meta: _efficientzerotensor_meta autogen: _efficientzerotensor.out - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: @@ -6673,16 +6811,16 @@ - func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor structured_delegate: _addmm_activation.out variants: function, method -- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? 
scale_result=None) -> (Tensor, Tensor) +- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False) -> (Tensor, Tensor) variants: function dispatch: CUDA: _scaled_mm_cuda -- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!)) +- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!)) variants: function dispatch: CUDA: _scaled_mm_out_cuda # NOTE [ Sparse: autograd and API ] @@ -7053,11 +7191,11 @@ # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors - func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] variants: function, method dispatch: CompositeExplicitAutograd: unbind - CompositeImplicitAutogradNestedTensor: NestedTensor_unbind + NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor @@ -7141,18 +7279,18 @@ variants: method dispatch: CPU: dense_to_mkldnn autogen: to_mkldnn.out -- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1, int[]? input_size=None) -> Tensor +- func: mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight autogen: mkldnn_reorder_conv2d_weight.out -- func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv3d_weight autogen: mkldnn_reorder_conv3d_weight.out @@ -7654,10 +7792,14 @@ - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: masked_scatter +- func: masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor + dispatch: + CompositeExplicitAutograd: masked_scatter_backward_symint + - func: _masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor dispatch: CUDA: masked_softmax_cuda CPU: masked_softmax_cpu autogen: _masked_softmax.out @@ -7936,10 +8078,12 @@ tags: [core, pointwise] - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method + dispatch: + CompositeExplicitAutograd: bitwise_and_ tags: pointwise - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: method @@ -7980,10 +8124,12 @@ tags: pointwise - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_or tags: [core, pointwise] - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function @@ -7999,10 +8145,12 @@ tags: [core, pointwise] - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method + dispatch: + CompositeExplicitAutograd: bitwise_or_ tags: pointwise - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method @@ -8043,10 +8191,12 @@ tags: pointwise - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function + dispatch: + CompositeExplicitAutograd: bitwise_xor tags: [core, pointwise] - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function @@ -8062,10 +8212,12 @@ tags: [core, pointwise] - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method + dispatch: + CompositeExplicitAutograd: bitwise_xor_ tags: pointwise - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method @@ -8502,10 +8654,11 @@ structured_delegate: eq.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: QuantizedCPU: eq_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested tags: [core, pointwise] - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -8538,10 +8691,11 @@ structured_delegate: ge.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: QuantizedCPU: ge_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested tags: [core, pointwise] - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -8664,10 +8818,11 @@ structured_delegate: gt.Scalar_out device_check: NoCheck # TensorIterator variants: method, function dispatch: QuantizedCPU: gt_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested tags: [core, pointwise] - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -9104,10 +9259,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: lgamma_out + MPS: lgamma_out_mps tags: pointwise - func: lgamma_(Tensor(a!) self) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured_delegate: lgamma.out @@ -9124,10 +9280,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: digamma_out + MPS: digamma_out_mps tags: pointwise - func: digamma(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: digamma.out @@ -9138,10 +9295,11 @@ device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: polygamma_out + MPS: polygamma_out_mps tags: pointwise - func: polygamma(int n, Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: polygamma.out @@ -9261,11 +9419,11 @@ structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan2_out MPS: atan2_out_mps - tags: pointwise + tags: [core, pointwise] - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: atan2.out variants: method @@ -9273,11 +9431,11 @@ - func: atan2(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: atan2.out variants: method, function - tags: pointwise + tags: [core, pointwise] # arctan2, alias of atan2 - func: arctan2(Tensor self, Tensor other) -> Tensor variants: method, function @@ -9462,11 +9620,11 @@ - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: nextafter_out + CPU, CUDA, MPS: nextafter_out tags: pointwise - func: nextafter(Tensor self, Tensor other) -> Tensor structured_delegate: nextafter.out variants: method, function @@ -9809,11 +9967,11 @@ tags: pointwise - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: pow.Scalar_out - tags: pointwise + tags: [core, pointwise] - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase @@ -10018,10 +10176,25 @@ dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ autogen: _foreach_add.ScalarList_out +- func: _foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_tensor_kernel_slow + CUDA: foreach_tensor_add_tensor_kernel_cuda + +- func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_add_tensor_kernel_slow_ + CUDA: foreach_tensor_add_tensor_kernel_cuda_ + autogen: _foreach_add.Tensor_out + - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow @@ -10168,10 +10341,25 @@ dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ autogen: _foreach_div.ScalarList_out +- func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[] + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_tensor_kernel_slow + CUDA: foreach_tensor_div_tensor_kernel_cuda + +- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> () + device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices + variants: function + dispatch: + CPU: foreach_tensor_div_tensor_kernel_slow_ + CUDA: foreach_tensor_div_tensor_kernel_cuda_ + autogen: _foreach_div.Tensor_out + - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices variants: function dispatch: CPU: foreach_tensor_clamp_max_scalar_kernel_slow @@ -10988,41 +11176,48 @@ - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda + MPS: bucketize_mps - func: bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: bucketize_out_cpu CUDA: bucketize_out_cuda + MPS: bucketize_out_mps - func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda + MPS: bucketize_mps autogen: bucketize.Scalar_out - func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda + MPS: searchsorted_mps - func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: searchsorted_out_cpu CUDA: searchsorted_out_cuda + MPS: searchsorted_out_mps - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? 
sorter=None) -> Tensor dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda + MPS: searchsorted_mps - func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: searchsorted_out_cpu CUDA: searchsorted_out_cuda + MPS: searchsorted_out_mps - func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor structured_delegate: _convert_indices_from_coo_to_csr.out - func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!) @@ -11566,10 +11761,11 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator python_module: nn dispatch: CPU, CUDA: softshrink_out + MPS: softshrink_out_mps - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor structured_delegate: softshrink.out device_check: NoCheck # TensorIterator python_module: nn @@ -11578,10 +11774,11 @@ structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: softshrink_backward_out + MPS: softshrink_backward_out_mps - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor structured_delegate: softshrink_backward.grad_input python_module: nn @@ -12480,105 +12677,105 @@ # one that is written in the native style: modern C++. Algorithmically, # these are the same thing, but we give them different prefixes to # make the operational distinction clear. tags: pointwise -- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: slow_conv_transpose2d_structured_cpu CUDA: slow_conv_transpose2d_structured_cuda -- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor python_module: nn structured_delegate: slow_conv_transpose2d.out -- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv_transpose3d_out_cpu CUDA: slow_conv_transpose3d_out_cuda -- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? 
bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_transpose3d_cpu CUDA: slow_conv_transpose3d_cuda -- func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor +- func: thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor python_module: nn -- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv2d_forward_out_cpu CUDA: slow_conv2d_forward_out_cuda -- func: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor +- func: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv2d_forward_cpu CUDA: slow_conv2d_forward_cuda -- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn dispatch: CPU: slow_conv2d_backward_out_cpu CUDA: slow_conv2d_backward_out_cuda -- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) +- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) python_module: nn dispatch: CPU: slow_conv2d_backward_cpu CUDA: slow_conv2d_backward_cuda autogen: _slow_conv2d_backward.output_mask_out -- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) +- func: _conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: nn dispatch: CUDA: conv_depthwise2d_cuda_out -- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor +- func: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? 
bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise2d_cuda -- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor +- func: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise3d_cuda autogen: conv_depthwise3d.out -- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor +- func: slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor python_module: nn -- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv3d_forward_out_cpu -- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor +- func: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv3d_forward_cpu -- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda autogen: slow_conv_dilated2d.out -- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated3d_cpu CUDA: slow_conv_dilated3d_cuda autogen: slow_conv_dilated3d.out @@ -14267,23 +14464,24 @@ - func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor) variants: function tags: nondeterministic_seeded -- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? 
@@ -14267,23 +14464,24 @@

- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
  variants: function
  tags: nondeterministic_seeded

-- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CPU: _scaled_dot_product_flash_attention_cpu
    CUDA: _scaled_dot_product_flash_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
  tags: nondeterministic_seeded

-- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
  device_check: NoCheck
  variants: function
  dispatch:
    CPU: _scaled_dot_product_flash_attention_backward_cpu
    CUDA: _scaled_dot_product_flash_attention_backward_cuda
+    NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested

- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
  dispatch:
    CUDA: _scaled_dot_product_efficient_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda

@@ -14293,30 +14491,30 @@
  device_check: NoCheck
  dispatch:
    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
  tags: nondeterministic_seeded

-- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  variants: function
  dispatch:
    CUDA: _flash_attention_forward
  tags: nondeterministic_seeded

-- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
  device_check: NoCheck
  variants: function
  dispatch:
    CUDA: _flash_attention_backward
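The `_scaled_dot_product_flash_attention*` schemas above are the private flash-attention kernels behind the public `torch.nn.functional.scaled_dot_product_attention`; the new `NestedTensorCUDA` backward entry extends training support to nested (jagged) batches. A hedged sketch of exercising the flash path, assuming a CUDA build of PyTorch 2.0-2.2 (`torch.backends.cuda.sdp_kernel` is the era-appropriate context manager; newer releases expose `torch.nn.attention.sdpa_kernel` instead):

```python
import torch
import torch.nn.functional as F

# Flash attention expects half-precision CUDA tensors of shape
# (batch, heads, seq_len, head_dim).
q, k, v = (torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
           for _ in range(3))

# Restrict dispatch to the flash kernel only.
with torch.backends.cuda.sdp_kernel(enable_flash=True,
                                    enable_math=False,
                                    enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```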
# Returns ouput, logsumexp if compute_logsumexp
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
  variants: function
  dispatch:
    CUDA: _efficient_attention_forward
  tags: nondeterministic_seeded

-- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
  device_check: NoCheck
  variants: function
  dispatch:
    CUDA: _efficient_attention_backward
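`_efficient_attention_forward`/`_backward` implement the memory-efficient SDPA backend; this release widens the `max_seqlen_*` arguments to `SymInt`, fixes their order in the backward, and has the forward additionally return the per-batch max sequence lengths. A companion sketch under the same assumptions as above, selecting the memory-efficient kernel (which, unlike flash, also accepts float32 inputs):

```python
import torch
import torch.nn.functional as F

q, k, v = (torch.randn(2, 8, 128, 64, device="cuda") for _ in range(3))

# Restrict dispatch to the memory-efficient kernel only.
with torch.backends.cuda.sdp_kernel(enable_flash=False,
                                    enable_math=False,
                                    enable_mem_efficient=True):
    out = F.scaled_dot_product_attention(q, k, v)
```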
@@ -14420,16 +14618,20 @@
  structured_delegate: special_chebyshev_polynomial_t.out
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14442,10 +14644,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14463,16 +14667,20 @@
  structured_delegate: special_chebyshev_polynomial_u.out
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14485,10 +14693,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14506,16 +14716,20 @@
  structured_delegate: special_chebyshev_polynomial_v.out
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14528,10 +14742,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14549,16 +14765,20 @@
  structured_delegate: special_chebyshev_polynomial_w.out
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14571,10 +14791,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14592,16 +14814,20 @@
  structured_delegate: special_hermite_polynomial_h.out
  variants: function
  tags: pointwise

- func: special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14614,10 +14840,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14635,16 +14863,20 @@
  structured_delegate: special_hermite_polynomial_he.out
  variants: function
  tags: pointwise

- func: special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14657,10 +14889,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14678,16 +14912,20 @@
  structured_delegate: special_laguerre_polynomial_l.out
  variants: function
  tags: pointwise

- func: special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14700,10 +14938,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14721,16 +14961,20 @@
  structured_delegate: special_legendre_polynomial_p.out
  variants: function
  tags: pointwise

- func: special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14743,10 +14987,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14854,16 +15100,20 @@
  structured_delegate: special_shifted_chebyshev_polynomial_t.out
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14876,10 +15126,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14897,16 +15149,20 @@
  structured_delegate: special_shifted_chebyshev_polynomial_u.out
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14919,10 +15175,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14940,16 +15198,20 @@
  structured_delegate: special_shifted_chebyshev_polynomial_v.out
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14962,10 +15224,12 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -14983,16 +15247,20 @@
  structured_delegate: special_shifted_chebyshev_polynomial_w.out
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise

@@ -15005,9 +15273,11 @@
  structured: True
  variants: function
  tags: pointwise

- func: special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w_out
  device_check: NoCheck
  python_module: special
  variants: function
  tags: pointwise
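The polynomial hunks above all follow one pattern: the `.x_scalar`, `.n_scalar`, and `.x_scalar_out` overloads of the `torch.special` orthogonal-polynomial ops gain an explicit `CompositeExplicitAutograd` kernel, rather than relying on the implicit composite default. A sketch of the two scalar overloads from Python, assuming the bindings accept a plain number wherever the schema says `Scalar`:

```python
import torch

x = torch.linspace(-1, 1, 5)

# n_scalar overload: tensor x, plain-number degree n
print(torch.special.chebyshev_polynomial_t(x, 3))    # T_3 evaluated at x

# x_scalar overload: plain-number x, tensor of degrees
n = torch.arange(4, dtype=torch.float32)
print(torch.special.chebyshev_polynomial_t(0.5, n))  # T_n evaluated at 0.5
```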