codegen/native_functions.yaml in torch-rb-0.17.1 vs codegen/native_functions.yaml in torch-rb-0.18.0

- old (lines removed; present in torch-rb-0.17.1)
+ new (lines added; present in torch-rb-0.18.0)
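Note on the dominant change below: most hunks add the SparseCsrMeta dispatch key alongside SparseCsrCPU/SparseCsrCUDA, registering the existing sparse-compressed (CSR/BSR) kernels for the meta device, i.e. shape/dtype-only tensors with no storage. A minimal Python sketch of what that enables, assuming the bundled libtorch accepts meta components in the CSR factory (these schemas are the upstream ATen declarations that torch-rb's codegen consumes; the example uses the PyTorch Python API for illustration only):

    import torch

    # Illustrative only: per the "+ SparseCsrMeta" entries in this diff, sparse-CSR ops such
    # as abs() now have kernels registered for the meta device, which shape propagation and
    # tracing rely on. Construction on "meta" is an assumption about the bundled build.
    crow = torch.tensor([0, 2, 3], device="meta")
    col = torch.tensor([0, 1, 1], device="meta")
    val = torch.tensor([1.0, 2.0, 3.0], device="meta")
    csr = torch.sparse_csr_tensor(crow, col, val, size=(2, 2))
    out = csr.abs()                        # per the diff: SparseCsrMeta -> abs_sparse_csr
    print(out.layout, out.device)          # expected: torch.sparse_csr meta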

@@ -336,30 +336,30 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: abs SparseCPU, SparseCUDA: abs_sparse - SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs tags: [core, pointwise] - func: abs_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: abs_ SparseCPU, SparseCUDA: abs_sparse_ - SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: abs_out MPS: abs_out_mps SparseCPU, SparseCUDA: abs_sparse_out - SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out tags: pointwise # Note [Adding an alias] # To add an alias do the following: # @@ -398,18 +398,18 @@ - func: angle(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: angle - SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: angle_sparse_csr tags: pointwise - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: angle_out - SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: angle_sparse_csr_out tags: pointwise - func: view_as_real(Tensor(a) self) -> Tensor(a) variants: function dispatch: @@ -423,31 +423,31 @@ - func: sgn(Tensor self) -> Tensor variants: function, method structured_delegate: sgn.out dispatch: SparseCPU, SparseCUDA: sgn_sparse - SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn tags: pointwise - func: sgn_(Tensor(a!) self) -> Tensor(a!) variants: method structured_delegate: sgn.out dispatch: SparseCPU, SparseCUDA: sgn_sparse_ - SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_ tags: pointwise - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sgn_out MPS: sgn_out_mps SparseCPU, SparseCUDA: sgn_sparse_out - SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out tags: pointwise - func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor variants: method @@ -470,11 +470,11 @@ - func: _conj_physical(Tensor self) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: _conj_physical - SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr autogen: _conj_physical.out - func: conj_physical(Tensor self) -> Tensor variants: function, method tags: pointwise @@ -482,18 +482,18 @@ - func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: conj_physical_out MPS: conj_physical_out_mps SparseCPU, SparseCUDA: conj_physical_out_sparse - SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out tags: pointwise - func: conj_physical_(Tensor(a!) 
self) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: conj_physical_ - SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_ tags: pointwise - func: resolve_conj(Tensor(a) self) -> Tensor(a) variants: function, method @@ -535,13 +535,15 @@ - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor tags: core + autogen: avg_pool1d.out - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor tags: core + autogen: adaptive_avg_pool1d.out # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor @@ -692,11 +694,14 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: all.out variants: function, method + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_all + - func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: all.dims_out variants: function, method cpp_no_default_args: ['dim'] @@ -861,29 +866,29 @@ - func: asinh(Tensor self) -> Tensor variants: function, method structured_delegate: asinh.out dispatch: SparseCPU, SparseCUDA: asinh_sparse - SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr tags: [core, pointwise] - func: asinh_(Tensor(a!) self) -> Tensor(a!) variants: function, method structured_delegate: asinh.out dispatch: SparseCPU, SparseCUDA: asinh_sparse_ - SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_ tags: pointwise - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asinh_out MPS: asinh_out_mps SparseCPU, SparseCUDA: asinh_sparse_out - SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out tags: pointwise # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor variants: function, method @@ -896,29 +901,29 @@ - func: atanh(Tensor self) -> Tensor structured_delegate: atanh.out variants: function, method dispatch: SparseCPU, SparseCUDA: atanh_sparse - SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr tags: [core, pointwise] - func: atanh_(Tensor(a!) self) -> Tensor(a!) structured_delegate: atanh.out variants: function, method dispatch: SparseCPU, SparseCUDA: atanh_sparse_ - SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_ tags: pointwise - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atanh_out MPS: atanh_out_mps SparseCPU, SparseCUDA: atanh_sparse_out - SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out tags: pointwise # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor variants: function, method @@ -952,31 +957,31 @@ device_check: NoCheck # TensorIterator variants: function, method structured_delegate: asin.out dispatch: SparseCPU, SparseCUDA: asin_sparse - SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr tags: [core, pointwise] - func: asin_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method structured_delegate: asin.out dispatch: SparseCPU, SparseCUDA: asin_sparse_ - SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_ tags: pointwise - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asin_out MPS: asin_out_mps SparseCPU, SparseCUDA: asin_sparse_out - SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out tags: pointwise # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor variants: function, method @@ -990,31 +995,31 @@ device_check: NoCheck # TensorIterator structured_delegate: atan.out variants: function, method dispatch: SparseCPU, SparseCUDA: atan_sparse - SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr tags: [core, pointwise] - func: atan_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: atan.out variants: function, method dispatch: SparseCPU, SparseCUDA: atan_sparse_ - SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_ tags: pointwise - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan_out MPS: atan_out_mps SparseCPU, SparseCUDA: atan_sparse_out - SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out tags: pointwise # arctan, alias of atan - func: arctan(Tensor self) -> Tensor variants: function, method @@ -1421,31 +1426,31 @@ device_check: NoCheck # TensorIterator structured_delegate: ceil.out variants: function, method dispatch: SparseCPU, SparseCUDA: ceil_sparse - SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr tags: [core, pointwise] - func: ceil_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: ceil.out variants: function, method dispatch: SparseCPU, SparseCUDA: ceil_sparse_ - SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_ tags: pointwise - func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: ceil_out MPS: ceil_out_mps SparseCPU, SparseCUDA: ceil_sparse_out - SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out tags: pointwise # alias for torch.linalg.multi_dot - func: chain_matmul(Tensor[] matrices) -> Tensor variants: function @@ -1760,11 +1765,11 @@ device_guard: False dispatch: MkldnnCPU: copy_mkldnn_ SparseCPU, SparseCUDA: copy_sparse_wrapper_ CompositeExplicitAutograd: copy_ - SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_ NestedTensorCPU, NestedTensorCUDA: copy_nested_ autogen: copy.out - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor dispatch: @@ -2336,11 +2341,11 @@ autogen: _embedding_bag.out tags: core - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor dispatch: - CompositeImplicitAutograd: _embedding_bag_backward_symint + CPU, CUDA: _embedding_bag_backward_symint - func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor dispatch: CompositeImplicitAutograd: _embedding_bag_sparse_backward_symint @@ -2368,12 +2373,14 @@ CPU: empty_cpu CUDA: empty_cuda MPS: empty_mps Meta: empty_meta_symint MkldnnCPU: empty_mkldnn - SparseCPU, SparseCUDA, SparseMeta: empty_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed + SparseCPU, SparseCUDA: empty_sparse + SparseMeta: empty_sparse_symint + SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + SparseCsrMeta: empty_sparse_compressed_symint QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized tags: core - func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: @@ -2444,11 +2451,11 @@ Meta: resize__symint CPU: resize_ CUDA: resize_cuda_ MPS: resize_mps_ QuantizedCPU: quantized_resize_cpu_ - SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: resize_sparse_csr_ autogen: resize, resize.out # This is a utility function to enable users to resize out tensor while registering kernels for out variants. # Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration # to make it easy to register out variants for ops. @@ -2495,31 +2502,31 @@ device_check: NoCheck # TensorIterator structured_delegate: erf.out variants: function, method dispatch: SparseCPU, SparseCUDA: erf_sparse - SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr tags: [core, pointwise] - func: erf_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: erf.out variants: function, method dispatch: SparseCPU, SparseCUDA: erf_sparse_ - SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_ tags: pointwise - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
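Besides more SparseCsrMeta registrations, the span above reroutes _embedding_bag_backward from CompositeImplicitAutograd to explicit CPU/CUDA kernels and splits the empty.memory_format dispatch so meta-device sparse tensors go through SymInt-aware factories (empty_sparse_symint, empty_sparse_compressed_symint). A hedged sketch, assuming empty() accepts sparse layouts on the meta device in the bundled build:

    import torch

    # Illustrative only: shape-only allocation of sparse COO/CSR tensors on the meta device,
    # which is what the new SparseMeta/SparseCsrMeta factory entries above are for.
    coo = torch.empty((2, 3), layout=torch.sparse_coo, device="meta")
    csr = torch.empty((2, 3), layout=torch.sparse_csr, device="meta")
    print(coo.layout, csr.layout, csr.device)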
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erf_out MPS: erf_out_mps SparseCPU, SparseCUDA: erf_sparse_out - SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out tags: pointwise - func: erfc(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: erfc.out @@ -2583,31 +2590,31 @@ device_check: NoCheck # TensorIterator structured_delegate: expm1.out variants: function, method dispatch: SparseCPU, SparseCUDA: expm1_sparse - SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr tags: [core, pointwise] - func: expm1_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: expm1.out variants: function, method dispatch: SparseCPU, SparseCUDA: expm1_sparse_ - SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_ tags: pointwise - func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: expm1_out MPS: expm1_out_mps SparseCPU, SparseCUDA: expm1_sparse_out - SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out tags: pointwise - func: expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_check: NoCheck @@ -2681,11 +2688,11 @@ dispatch: CPU, CUDA: fill_ MPS: fill_scalar_mps QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ - SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: fill_sparse_csr_ NestedTensorCPU, NestedTensorCUDA: fill_nested_ autogen: fill.Scalar_out - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -2702,31 +2709,31 @@ device_check: NoCheck # TensorIterator structured_delegate: floor.out variants: function, method dispatch: SparseCPU, SparseCUDA: floor_sparse - SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr tags: [core, pointwise] - func: floor_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: floor.out variants: function, method dispatch: SparseCPU, SparseCUDA: floor_sparse_ - SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_ tags: pointwise - func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: floor_out MPS: floor_out_mps SparseCPU, SparseCUDA: floor_sparse_out - SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out tags: pointwise - func: floor_divide(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -2767,31 +2774,31 @@ device_check: NoCheck # TensorIterator structured_delegate: frac.out variants: function, method dispatch: SparseCPU, SparseCUDA: frac_sparse - SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr tags: pointwise - func: frac_(Tensor(a!) self) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured_delegate: frac.out variants: function, method dispatch: SparseCPU, SparseCUDA: frac_sparse_ - SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_ tags: pointwise - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: frac_out MPS: frac_out_mps SparseCPU, SparseCUDA: frac_sparse_out - SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out tags: pointwise - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_check: NoCheck device_guard: False @@ -3059,10 +3066,22 @@ - func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function dispatch: CompositeExplicitAutograd: _unsafe_index +# Used by inductor to generate masked loads +# Note that we don't support boolean indexing, to avoid dynamic output shapes +- func: _unsafe_masked_index(Tensor self, Tensor mask, Tensor?[] indices, Scalar fill) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _unsafe_masked_index + +- func: _unsafe_masked_index_put_accumulate(Tensor self, Tensor mask, Tensor?[] indices, Tensor values) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _unsafe_masked_index_put_accumulate + - func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) structured: True variants: function precomputed: - dim -> int dim @@ -3159,11 +3178,11 @@ device_check: NoCheck device_guard: False dispatch: CPU, CUDA, MPS: isnan SparseCPU, SparseCUDA: isnan_sparse - SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr autogen: isnan.out tags: [core, pointwise] - func: is_distributed(Tensor self) -> bool variants: function, method @@ -3379,10 +3398,14 @@ - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) - func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor +- func: _wrapped_linear_prepack(Tensor weight, Tensor weight_scale, Tensor weight_zero_point, Tensor bias) -> Tensor + +- func: _wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor + - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor @@ -3485,31 +3508,31 @@ device_check: NoCheck # TensorIterator structured_delegate: log1p.out variants: function, method dispatch: SparseCPU, SparseCUDA: log1p_sparse - SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr tags: [core, pointwise] - func: log1p_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: log1p.out variants: function, method dispatch: SparseCPU, SparseCUDA: log1p_sparse_ - SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_ tags: pointwise - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
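The new _unsafe_masked_index and _unsafe_masked_index_put_accumulate entries above exist, per the in-file comment, so inductor can emit masked loads without boolean indexing (which would give dynamic output shapes). A hedged pure-Python reference of the intended behavior — an approximation with a hypothetical helper name, not the authoritative kernel:

    import torch

    # Roughly: gather `x` at `indices`, but report `fill` wherever `mask` is False, where the
    # indices are allowed to be out of range. The clamp stands in for the "unsafe" access.
    def masked_index_reference(x, mask, indices, fill):
        safe = [idx.clamp(0, size - 1) for idx, size in zip(indices, x.shape)]
        gathered = x[tuple(safe)]
        return torch.where(mask, gathered, torch.full_like(gathered, fill))

    x = torch.arange(10.0)
    idx = torch.tensor([0, 3, 99])             # 99 would be an invalid load if not masked off
    mask = torch.tensor([True, True, False])
    print(masked_index_reference(x, mask, [idx], 0.0))   # tensor([0., 3., 0.])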
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log1p_out MPS: log1p_out_mps SparseCPU, SparseCUDA: log1p_sparse_out - SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out tags: pointwise - func: log2(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: log2.out @@ -3897,15 +3920,14 @@ dispatch: CompositeExplicitAutograd: mean tags: core # For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this. -# FIXME: fix CI jobs and re-enable this -#- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) -# device_check: NoCheck # TensorIterator -# dispatch: -# CompositeExplicitAutograd: mean_dtype_out +- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + dispatch: + CompositeExplicitAutograd: mean_dtype_out - func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: mean.out device_check: NoCheck # TensorIterator variants: function, method @@ -4093,21 +4115,21 @@ - func: mm(Tensor self, Tensor mat2) -> Tensor structured_delegate: mm.out variants: function, method dispatch: SparseCPU, SparseCUDA: _sparse_mm - SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm tags: core - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: mm_out_cpu CUDA: mm_out_cuda MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out - SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out - func: _int_mm(Tensor self, Tensor mat2) -> Tensor dispatch: CPU: _int_mm_cpu CUDA: _int_mm_cuda @@ -4119,10 +4141,11 @@ - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor dispatch: CPU: _convert_weight_to_int4pack_cpu CUDA: _convert_weight_to_int4pack_cuda + MPS: _convert_weight_to_int4pack_mps - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor dispatch: CPU: _weight_int4pack_mm_cpu MPS: _weight_int4pack_mm_mps @@ -4163,11 +4186,11 @@ device_check: NoCheck # TensorIterator structured_delegate: mul.out variants: function, method dispatch: SparseCPU, SparseCUDA: mul_sparse - SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr MkldnnCPU: mkldnn_mul ZeroTensor: mul_zerotensor NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor tags: [core, pointwise] @@ -4175,11 +4198,11 @@ device_check: NoCheck # TensorIterator structured_delegate: mul.out variants: method dispatch: SparseCPU, SparseCUDA: mul_sparse_ - SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_ MkldnnCPU: mkldnn_mul_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor tags: pointwise - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
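Also notable in the span above: the previously commented-out mean.dtype_out overload ("FIXME: fix CI jobs and re-enable this") is re-enabled, and _convert_weight_to_int4pack gains an MPS kernel. A hedged sketch of the re-enabled overload, assuming the regenerated Python binding exposes out= for the full reduction:

    import torch

    # Illustrative only: full-tensor mean with an explicit dtype and an out= buffer.
    x = torch.rand(4, 4)
    buf = torch.empty((), dtype=torch.float64)
    torch.mean(x, dtype=torch.float64, out=buf)
    print(buf)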
@@ -4189,30 +4212,30 @@ dispatch: CPU, CUDA: mul_out MPS: mul_out_mps SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda - SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr MkldnnCPU: mkldnn_mul_out tags: pointwise # For C++ only, until we have conversion from C++ numbers to Tensor - func: mul.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CompositeExplicitAutograd: mul - SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_scalar_sparse_csr NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar tags: [core, pointwise] - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: mul_ - SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul__scalar_sparse_csr NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar autogen: mul.Scalar_out tags: pointwise # multiply, alias for mul @@ -4528,25 +4551,28 @@ CompositeImplicitAutograd: math_channel_shuffle - func: is_pinned(Tensor self, Device? device=None) -> bool variants: method dispatch: - NestedTensorCUDA, CUDA: is_pinned_cuda - MPS: is_pinned_mps - CompositeExplicitAutograd: is_pinned_default + # the NestedTensor keys are necessary because NestedTensor has been removed + # from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys] + CompositeExplicitAutograd, NestedTensorCPU: is_pinned + SparseCsrCPU: is_pinned_sparse_compressed + SparseCPU: is_pinned_sparse_coo # TODO: add a copy kwarg that guarantees that the tensor is put into fresh # pinned memory - func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a) variants: method # Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor - func: _pin_memory(Tensor self, Device? device=None) -> Tensor dispatch: - CUDA: _pin_memory_cuda - MPS: _pin_memory_mps - NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested + CompositeExplicitAutograd: _pin_memory + NestedTensorCPU: _pin_memory_nested + SparseCPU: _pin_memory_sparse_coo + SparseCsrCPU: _pin_memory_sparse_compressed autogen: _pin_memory.out - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method @@ -4556,46 +4582,46 @@ - func: rad2deg(Tensor self) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: rad2deg SparseCPU, SparseCUDA: rad2deg_sparse - SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr - func: rad2deg_(Tensor(a!) self) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: rad2deg_ SparseCPU, SparseCUDA: rad2deg_sparse_ - SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_ - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
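The is_pinned/_pin_memory entries above are reworked to dispatch through CompositeExplicitAutograd plus dedicated sparse kernels (is_pinned_sparse_coo/_compressed, _pin_memory_sparse_coo/_compressed), so pinning should extend to sparse COO and CSR tensors. A hedged sketch, assuming a CUDA-capable runtime (pinned host memory needs one):

    import torch

    # Illustrative only: pin dense and sparse tensors; the sparse paths are what this span adds.
    if torch.cuda.is_available():
        dense = torch.rand(3, 3).pin_memory()
        coo = torch.rand(3, 3).to_sparse().pin_memory()
        csr = torch.rand(3, 3).to_sparse_csr().pin_memory()
        print(dense.is_pinned(), coo.is_pinned(), csr.is_pinned())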
dispatch: CompositeExplicitAutograd: rad2deg_out SparseCPU, SparseCUDA: rad2deg_sparse_out - SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out - func: deg2rad(Tensor self) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: deg2rad SparseCPU, SparseCUDA: deg2rad_sparse - SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr tags: pointwise - func: deg2rad_(Tensor(a!) self) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: deg2rad_ SparseCPU, SparseCUDA: deg2rad_sparse_ - SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_ tags: pointwise - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: deg2rad_out SparseCPU, SparseCUDA: deg2rad_sparse_out - SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out tags: pointwise - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: CompositeExplicitAutograd: scalar_tensor @@ -4809,21 +4835,21 @@ device_check: NoCheck # TensorIterator structured_delegate: neg.out variants: function, method dispatch: SparseCPU, SparseCUDA: neg_sparse - SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg tags: [core, pointwise] - func: neg_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: neg.out variants: function, method dispatch: SparseCPU, SparseCUDA: neg_sparse_ - SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_ tags: pointwise - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -4831,11 +4857,11 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: neg_out MPS: neg_out_mps SparseCPU, SparseCUDA: neg_out_sparse - SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out tags: pointwise # Alias for neg - func: negative(Tensor self) -> Tensor variants: function, method @@ -4915,20 +4941,20 @@ device_check: NoCheck # TensorIterator structured_delegate: round.out variants: function, method dispatch: SparseCPU, SparseCUDA: round_sparse - SparseCsrCPU, SparseCsrCUDA: round_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr tags: [core, pointwise] - func: round_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: round.out variants: function, method dispatch: SparseCPU, SparseCUDA: round_sparse_ - SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_ tags: pointwise - func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True @@ -4936,11 +4962,11 @@ dispatch: CPU: round_out CUDA: round_out MPS: round_out_mps SparseCPU, SparseCUDA: round_sparse_out - SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out tags: pointwise - func: round.decimals(Tensor self, *, int decimals) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: round.decimals_out @@ -4979,11 +5005,11 @@ MkldnnCPU: mkldnn_relu QuantizedCPU: relu_quantized_cpu QuantizedCUDA: relu_quantized_cuda NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu SparseCPU, SparseCUDA: relu_sparse - SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr tags: [core, pointwise] - func: relu_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method @@ -4993,11 +5019,11 @@ MkldnnCPU: mkldnn_relu_ QuantizedCPU: relu_quantized_cpu_ QuantizedCUDA: relu_quantized_cuda_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_ SparseCPU, SparseCUDA: relu_sparse_ - SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_ autogen: relu.out tags: pointwise - func: relu6(Tensor self) -> Tensor python_module: nn @@ -5126,11 +5152,11 @@ variants: function, method device_check: NoCheck device_guard: False dispatch: CompositeExplicitAutograd: select_symint - SparseCsrCPU, SparseCsrCUDA: select_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: select_sparse_csr NestedTensorCPU, NestedTensorCUDA: select_nested tags: core - func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor variants: function @@ -5275,32 +5301,32 @@ - func: sin(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: sin.out variants: function, method dispatch: - SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr SparseCPU, SparseCUDA: sin_sparse NestedTensorCPU, NestedTensorCUDA: sin_nested tags: [core, pointwise] - func: sin_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: sin.out variants: function, method dispatch: - SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_ SparseCPU, SparseCUDA: sin_sparse_ tags: pointwise - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sin_out MPS: sin_out_mps - SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out SparseCPU, SparseCUDA: sin_sparse_out tags: pointwise - func: sinc(Tensor self) -> Tensor structured_delegate: sinc.out @@ -5323,31 +5349,31 @@ device_check: NoCheck # TensorIterator structured_delegate: sinh.out variants: function, method dispatch: SparseCPU, SparseCUDA: sinh_sparse - SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr tags: [core, pointwise] - func: sinh_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: sinh.out variants: function, method dispatch: SparseCPU, SparseCUDA: sinh_sparse_ - SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_ tags: pointwise - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sinh_out MPS: sinh_out_mps SparseCPU, SparseCUDA: sinh_sparse_out - SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. # # NOTE: Previously, if we change the tensor metadata (e.g. sizes / strides / @@ -5730,11 +5756,11 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: NestedTensorCPU: NestedTensor_sum_dim_CPU SparseCPU, SparseCUDA: sum_sparse_coo - SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed tags: core - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -5776,31 +5802,31 @@ device_check: NoCheck # TensorIterator structured_delegate: sqrt.out variants: function, method dispatch: SparseCPU, SparseCUDA: sqrt_sparse - SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr tags: [core, pointwise] - func: sqrt_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: sqrt.out variants: function, method dispatch: SparseCPU, SparseCUDA: sqrt_sparse_ - SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_ tags: pointwise - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sqrt_out MPS: sqrt_out_mps SparseCPU, SparseCUDA: sqrt_sparse_out - SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out tags: pointwise - func: square(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -5934,53 +5960,53 @@ device_check: NoCheck # TensorIterator structured_delegate: tan.out variants: function, method dispatch: SparseCPU, SparseCUDA: tan_sparse - SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr tags: [core, pointwise] - func: tan_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: tan.out variants: function, method dispatch: SparseCPU, SparseCUDA: tan_sparse_ - SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_ tags: pointwise - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tan_out MPS: tan_out_mps SparseCPU, SparseCUDA: tan_sparse_out - SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out tags: pointwise - func: tanh(Tensor self) -> Tensor device_check: NoCheck # TensorIterator structured_delegate: tanh.out variants: function, method dispatch: QuantizedCPU: tanh_quantized_cpu MkldnnCPU: mkldnn_tanh SparseCPU, SparseCUDA: tanh_sparse - SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh tags: [core, pointwise] - func: tanh_(Tensor(a!) self) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured_delegate: tanh.out variants: function, method dispatch: MkldnnCPU: mkldnn_tanh_ SparseCPU, SparseCUDA: tanh_sparse_ - SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_ tags: pointwise - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -5988,11 +6014,11 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_out MPS: tanh_out_mps SparseCPU, SparseCUDA: tanh_sparse_out - SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out tags: pointwise - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function @@ -6025,19 +6051,19 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_backward_out MPS: threshold_backward_out_mps SparseCPU, SparseCUDA: threshold_backward_sparse_out - SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: threshold_backward_sparse_compressed_out - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor variants: function structured_delegate: threshold_backward.grad_input dispatch: MkldnnCPU: mkldnn_relu_backward SparseCPU, SparseCUDA: threshold_backward_sparse - SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: threshold_backward_sparse_compressed NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested tags: pointwise - func: tile(Tensor self, SymInt[] dims) -> Tensor variants: function, method @@ -6183,16 +6209,16 @@ tags: view_copy dispatch: CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy autogen: _nested_view_from_buffer_copy.out -- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a) +- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None) -> Tensor(a) variants: function device_check: NoCheck dispatch: {} -- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor +- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? 
max_seqlen=None) -> Tensor variants: function device_check: NoCheck tags: view_copy dispatch: CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy @@ -6225,10 +6251,20 @@ - func: _nested_get_ragged_idx(Tensor self) -> int variants: function device_check: NoCheck dispatch: {} +- func: _nested_get_min_seqlen(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_max_seqlen(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + - func: _nested_get_jagged_dummy(Tensor any) -> Tensor category_override: dummy dispatch: {} - func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor) @@ -6249,31 +6285,31 @@ structured_delegate: trunc.out device_check: NoCheck # TensorIterator variants: function, method dispatch: SparseCPU, SparseCUDA: trunc_sparse - SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr tags: [core, pointwise] - func: trunc_(Tensor(a!) self) -> Tensor(a!) structured_delegate: trunc.out device_check: NoCheck # TensorIterator variants: function, method dispatch: SparseCPU, SparseCUDA: trunc_sparse_ - SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_ tags: pointwise - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: trunc_out MPS: trunc_out_mps SparseCPU, SparseCUDA: trunc_sparse_out - SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out tags: pointwise # Alias for trunc - func: fix(Tensor self) -> Tensor variants: function, method @@ -6441,10 +6477,11 @@ - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA, MPS: where + NestedTensorCPU, NestedTensorCUDA: NestedTensor_where tags: [core, pointwise] - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: @@ -6778,11 +6815,11 @@ - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor variants: function, method dispatch: CompositeExplicitAutograd: clone SparseCPU, SparseCUDA: clone_sparse - SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone NestedTensorCPU, NestedTensorCUDA: clone_nested autogen: clone.out tags: [core, pointwise] @@ -6802,11 +6839,11 @@ - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) use_const_ref_for_mutable_tensors: True variants: function, method dispatch: SparseCPU, SparseCUDA: resize_as_sparse_ - SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_compressed_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: resize_as_sparse_compressed_ autogen: resize_as_sparse, resize_as_sparse.out - func: zero_(Tensor(a!) self) -> Tensor(a!) 
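This span threads min/max sequence-length metadata through the jagged nested-tensor view ops (_nested_view_from_jagged gains min_seqlen/max_seqlen, with new _nested_get_min_seqlen/_nested_get_max_seqlen getters) and adds a NestedTensor kernel for where.self. A hedged sketch of what that metadata corresponds to at the public level, assuming the jagged layout exposes offsets() as in recent PyTorch:

    import torch

    # Illustrative only: the per-row lengths of a layout=torch.jagged nested tensor, recovered
    # from its offsets; the new ops above cache exactly this min/max information internally.
    nt = torch.nested.nested_tensor(
        [torch.rand(2, 8), torch.rand(5, 8), torch.rand(3, 8)],
        layout=torch.jagged,
    )
    lengths = nt.offsets().diff()
    print(lengths.min().item(), lengths.max().item())   # 2 5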
device_check: NoCheck # TensorIterator variants: method, function @@ -6960,11 +6997,11 @@ structured_delegate: addmm.out variants: function, method dispatch: SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda - SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense tags: core - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) structured_delegate: addmm.out variants: method @@ -6982,16 +7019,16 @@ - func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor structured_delegate: _addmm_activation.out variants: function, method -- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False) -> (Tensor, Tensor) +- func: _scaled_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor variants: function dispatch: CUDA: _scaled_mm_cuda -- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!)) +- func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: CUDA: _scaled_mm_out_cuda # NOTE [ Sparse: autograd and API ] @@ -7182,11 +7219,11 @@ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor variants: method dispatch: SparseCPU, SparseCUDA: sparse_mask - SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_mask_sparse_compressed autogen: sparse_mask.out - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor variants: method dispatch: @@ -7202,11 +7239,11 @@ # Special case of to_dense with custom derivative - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor variants: method dispatch: SparseCPU, SparseCUDA: sparse_to_dense - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense autogen: _to_dense.out - func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor @@ -7383,11 +7420,11 @@ - func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse SparseCPU, SparseCUDA: sparse_coo_to_sparse - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_sparse autogen: _to_sparse.sparse_dim_out - func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor variants: method @@ -7395,11 +7432,11 @@ - func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? 
dense_dim=None) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse SparseCPU, SparseCUDA: sparse_coo_to_sparse - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_sparse autogen: _to_sparse.out - func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor variants: method @@ -7407,11 +7444,11 @@ - func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse_csr SparseCPU, SparseCUDA: coo_to_sparse_csr - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_sparse_csr autogen: _to_sparse_csr.out - func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor variants: method @@ -7419,11 +7456,11 @@ - func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse_csc SparseCPU, SparseCUDA: coo_to_sparse_csc - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_sparse_csc autogen: _to_sparse_csc.out - func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor variants: method @@ -7431,11 +7468,11 @@ - func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse_bsr SparseCPU, SparseCUDA: coo_to_sparse_bsr - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_sparse_bsr autogen: _to_sparse_bsr.out - func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor variants: method @@ -7443,11 +7480,11 @@ - func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse_bsc SparseCPU, SparseCUDA: coo_to_sparse_bsc - SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_sparse_bsc autogen: _to_sparse_bsc.out - func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor) variants: function dispatch: @@ -8429,33 +8466,33 @@ - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: __lshift__ + CPU, CUDA, MPS: __lshift__ tags: pointwise - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: __lshift__ + CPU, CUDA, MPS: __lshift__ tags: pointwise - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: __ilshift__ + CPU, CUDA, MPS: __ilshift__ autogen: __lshift__.Scalar_out tags: pointwise - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: __ilshift__ + CPU, CUDA, MPS: __ilshift__ autogen: __lshift__.Tensor_out tags: pointwise - func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator @@ -8472,11 +8509,11 @@ - func: bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
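The _scaled_mm schema in the span above changes shape: scale_a/scale_b become required positional tensors, out_amax disappears, and a single Tensor is returned. A hedged sketch of the new call shape, assuming an FP8-capable CUDA device is present (treat it as illustrating the signature, not a benchmark):

    import torch

    # Illustrative only: FP8 matmul via the reworked private _scaled_mm.
    if torch.cuda.is_available():
        a = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn)
        b = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn).t()  # mat2 column-major
        scale_a = torch.tensor(1.0, device="cuda")
        scale_b = torch.tensor(1.0, device="cuda")
        out = torch._scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
        print(out.shape)   # torch.Size([16, 16])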
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: bitwise_left_shift_out + CPU, CUDA, MPS: bitwise_left_shift_out tags: pointwise - func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -8508,32 +8545,32 @@ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: __rshift__ + CPU, CUDA, MPS: __rshift__ tags: pointwise - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: __rshift__ + CPU, CUDA, MPS: __rshift__ tags: pointwise - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: __irshift__ + CPU, CUDA, MPS: __irshift__ autogen: __rshift__.Scalar_out - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: __irshift__ + CPU, CUDA, MPS: __irshift__ autogen: __rshift__.Tensor_out - func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -8549,11 +8586,11 @@ - func: bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: bitwise_right_shift_out + CPU, CUDA, MPS: bitwise_right_shift_out tags: pointwise - func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -8856,10 +8893,11 @@ structured_delegate: eq.Tensor_out device_check: NoCheck # TensorIterator variants: method, function dispatch: QuantizedCPU: eq_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: eq_tensor_nested tags: [core, pointwise] - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -9500,31 +9538,31 @@ device_check: NoCheck # TensorIterator structured_delegate: erfinv.out variants: method, function dispatch: SparseCPU, SparseCUDA: erfinv_sparse - SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr tags: pointwise - func: erfinv_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: erfinv.out variants: method dispatch: SparseCPU, SparseCUDA: erfinv_sparse_ - SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_ tags: pointwise - func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
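The shift operators in this span (__lshift__/__rshift__, their in-place forms, and the bitwise_*_shift out variants) gain MPS kernels, and eq.Tensor gains NestedTensor dispatch. A hedged sketch for the MPS case, assuming an Apple-silicon build:

    import torch

    # Illustrative only: with the new MPS kernels these should run on the GPU rather than
    # falling back to the CPU.
    if torch.backends.mps.is_available():
        x = torch.arange(8, device="mps")
        print(x << 1, x >> 1)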
device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erfinv_out MPS: erfinv_out_mps SparseCPU, SparseCUDA: erfinv_sparse_out - SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out tags: pointwise - func: i0(Tensor self) -> Tensor structured_delegate: i0.out variants: function, method @@ -9546,50 +9584,50 @@ device_check: NoCheck # TensorIterator structured_delegate: sign.out variants: function, method dispatch: SparseCPU, SparseCUDA: sign_sparse - SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr tags: [core, pointwise] - func: sign_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator structured_delegate: sign.out variants: method dispatch: SparseCPU, SparseCUDA: sign_sparse_ - SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_ tags: pointwise - func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sign_out MPS: sign_out_mps SparseCPU, SparseCUDA: sign_sparse_out - SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out tags: pointwise - func: signbit(Tensor self) -> Tensor variants: function, method structured_delegate: signbit.out dispatch: SparseCPU, SparseCUDA: signbit_sparse - SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr tags: pointwise - func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU: signbit_out CUDA: signbit_out MPS: signbit_out_mps SparseCPU, SparseCUDA: signbit_sparse_out - SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out tags: pointwise - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -10036,14 +10074,15 @@ variants: method, function - func: argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor device_check: NoCheck # TensorIterator variants: method, function - dispatch: - CPU, CUDA, MPS: argsort_stable - autogen: argsort.stable_out +- func: argsort.stable_out(Tensor self, *, bool stable, int dim=-1, bool descending=False, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + variants: function + - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor variants: method, function - func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) structured: True @@ -10218,11 +10257,11 @@ variants: method dispatch: CPU, CUDA: normal_ MPS: normal_mps_ Meta: normal_meta_ - SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: normal_sparse_csr_ NestedTensorCPU, NestedTensorCUDA: normal_nested_ autogen: normal.out # Only used by the functionalization pass. 
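In the span above, argsort's stable overload is reorganized: the explicit CPU/CUDA/MPS dispatch and autogen entry are dropped and a dedicated argsort.stable_out overload is added. The user-facing call is unchanged; a minimal sketch:

    import torch

    # Illustrative only: stable argsort keeps the original order of equal elements.
    x = torch.tensor([3, 1, 2, 1])
    print(torch.argsort(x, stable=True))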
# Normally, the codegen would be able to generate a normal() NativeFunction, @@ -13022,11 +13061,11 @@ device_guard: False dispatch: CompositeExplicitAutograd: isinf SparseCPU, SparseCUDA: isinf_sparse SparseMeta: isinf_sparse_meta - SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr autogen: isinf.out tags: [core, pointwise] - func: record_stream(Tensor(a!) self, Stream s) -> () variants: method @@ -13036,37 +13075,37 @@ - func: isposinf(Tensor self) -> Tensor variants: function, method structured_delegate: isposinf.out dispatch: SparseCPU, SparseCUDA: isposinf_sparse - SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr tags: pointwise - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: isposinf_out SparseCPU, SparseCUDA: isposinf_sparse_out - SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out tags: pointwise - func: isneginf(Tensor self) -> Tensor variants: function, method structured_delegate: isneginf.out dispatch: SparseCPU, SparseCUDA: isneginf_sparse - SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr tags: pointwise - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: isneginf_out SparseCPU, SparseCUDA: isneginf_sparse_out - SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out tags: pointwise # NOTE [_add_batch_dim and _remove_batch_dim] # _add_batch_dim and _remove_batch_dim are meant to be used in the implementation # of the vmap frontend API (see torch/_vmap_internals.py). They are not @@ -13785,14 +13824,20 @@ # linalg.lu_factor - func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots) python_module: linalg variants: function + dispatch: + CompositeImplicitAutograd: linalg_lu_factor + MPS: linalg_lu_factor_mps - func: linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots) python_module: linalg variants: function + dispatch: + CompositeImplicitAutograd: linalg_lu_factor_out + MPS: linalg_lu_factor_out_mps - func: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) python_module: linalg structured_delegate: linalg_lu_factor_ex.out variants: function @@ -14174,10 +14219,15 @@ python_module: linalg - func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor python_module: linalg +- func: _spsolve(Tensor A, Tensor B, *, bool left=True) -> Tensor + python_module: sparse + dispatch: + SparseCsrCUDA: _sparse_csr_linear_solve + - func: linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!) 
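This span adds explicit MPS kernels for linalg_lu_factor(.out) and introduces a new private _spsolve (a sparse-CSR linear solve registered for SparseCsrCUDA). The public LU call is unchanged; a minimal sketch, which on an Apple-silicon build should also be eligible to run on "mps":

    import torch

    # Illustrative only: LU factorization via the public linalg API.
    A = torch.rand(4, 4)
    LU, pivots = torch.linalg.lu_factor(A)
    print(LU.shape, pivots.shape)   # torch.Size([4, 4]) torch.Size([4])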
- func: linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!)
  python_module: linalg

- func: linalg_tensorinv(Tensor self, int ind=2) -> Tensor
  python_module: linalg

@@ -14350,11 +14400,11 @@
  variants: function
  dispatch:
    CPU, CUDA: _segment_reduce_backward_kernel
  autogen: _segment_reduce_backward.out

-- func: pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor
+- func: pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0, str padding_side="right") -> Tensor
  python_module: nn
  variants: function

- func: flatten_dense_tensors(Tensor[] tensors) -> Tensor
  variants: function

@@ -14456,11 +14506,11 @@
- func: select_copy.int(Tensor self, int dim, SymInt index) -> Tensor
  variants: function
  dispatch:
    CompositeExplicitAutogradNonFunctional: select_copy_symint
-   SparseCsrCPU, SparseCsrCUDA: select_copy_sparse_csr
+   SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: select_copy_sparse_csr
  tags: view_copy
  autogen: select_copy.int_out

- func: detach_copy(Tensor self) -> Tensor
  variants: function

@@ -14646,22 +14696,29 @@
- func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor
  variants: function
  dispatch:
    CUDA: _fbgemm_jagged_to_padded_dense_forward
+   CPU: _jagged_to_padded_dense_forward_cpu

- func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor
  variants: function
  dispatch:
    CUDA: _fbgemm_dense_to_jagged_forward_symint
+   CPU: _padded_dense_to_jagged_forward_cpu

- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
  dispatch:
    NestedTensorCPU: NestedTensor_softmax_dropout
    NestedTensorCUDA: NestedTensor_softmax_dropout_cuda
  tags: nondeterministic_seeded

+- func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _safe_softmax
+    NestedTensorCPU, NestedTensorCUDA: _safe_softmax
+
# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
  variants: function
  dispatch:
    CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward

@@ -14672,28 +14729,33 @@
  dispatch:
    CPU, NestedTensorCPU: native_multi_head_attention_cpu
    CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
  autogen: _native_multi_head_attention.out

-- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor
+- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> Tensor
  python_module: nn
  variants: function
  autogen: scaled_dot_product_attention.out
  tags: nondeterministic_seeded
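The `scaled_dot_product_attention` signature above gains an `enable_gqa` flag for grouped-query attention, where several query heads share one key/value head. A minimal sketch against the upstream Python API, assuming the binding mirrors this schema:

```python
import torch
import torch.nn.functional as F

# Grouped-query attention: 8 query heads share 2 key/value heads.
# Shapes are (batch, heads, seq_len, head_dim); the numbers are illustrative.
q = torch.randn(2, 8, 128, 64)
k = torch.randn(2, 2, 128, 64)
v = torch.randn(2, 2, 128, 64)

out = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=True)
print(out.shape)  # torch.Size([2, 8, 128, 64])
```

Without `enable_gqa=True`, mismatched query and key/value head counts are expected to be rejected rather than silently broadcast.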
# This aten function is kept so that we can test the choice function from Python
-- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int
+- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> int
  dispatch:
    Meta: _fused_sdp_choice_meta
    CPU, NestedTensorCPU: _fused_sdp_choice_cpp
    CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
  tags: nondeterministic_seeded

-- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None, bool enable_gqa=False) -> (Tensor, Tensor)
  variants: function
  tags: nondeterministic_seeded

+- func: _scaled_dot_product_attention_math_for_mps(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
+  dispatch:
+    MPS: _scaled_dot_product_attention_math_mps
+  tags: nondeterministic_seeded
+
- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CUDA: _scaled_dot_product_flash_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
  tags: nondeterministic_seeded

@@ -14701,10 +14763,15 @@
- func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
  dispatch:
    CPU: _scaled_dot_product_flash_attention_cpu
  tags: nondeterministic_seeded

+- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+  dispatch:
+    CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable
+  tags: nondeterministic_seeded
+
- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
  device_check: NoCheck
  variants: function
  dispatch:
    CUDA: _scaled_dot_product_flash_attention_backward_cuda

@@ -14714,10 +14781,16 @@
  device_check: NoCheck
  variants: function
  dispatch:
    CPU: _scaled_dot_product_flash_attention_cpu_backward

+- func: _scaled_dot_product_fused_attention_overrideable_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor attn_bias, bool[4] grad_input_mask, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value, Tensor grad_attn_bias)
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable_backward
+
- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
  dispatch:
    CUDA: _scaled_dot_product_efficient_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
  tags: nondeterministic_seeded

@@ -14726,16 +14799,16 @@
  device_check: NoCheck
  dispatch:
    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
  tags: nondeterministic_seeded

-- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CUDA: _scaled_dot_product_cudnn_attention_cuda
  tags: nondeterministic_seeded

-- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor)
  dispatch:
    CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
  tags: nondeterministic_seeded

- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)

@@ -15561,55 +15634,61 @@
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
  variants: function
  dispatch:
    CPU: _fused_adam_kernel_cpu_
    CUDA: _fused_adam_kernel_cuda_
+   MPS: _fused_adam_kernel_mps_
  autogen: _fused_adam, _fused_adam.out

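The `MPS: _fused_adam_kernel_mps_` entry here (and the matching additions for `_fused_adamw_` and `_fused_sgd_` below) suggests the fused optimizer step can now dispatch to Metal. A hypothetical sketch via the upstream `fused=True` optimizer flag; whether a given build actually accepts the flag on an MPS device is not established by this diff:

```python
import torch

# Assumes an Apple-silicon build with MPS available.
model = torch.nn.Linear(16, 4).to("mps")
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, fused=True)

loss = model(torch.randn(8, 16, device="mps")).sum()
loss.backward()
opt.step()  # the fused step would hit the new _fused_adamw_ MPS kernel
```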
- func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
  # but still skip the device check as the Tensor LR can be on CPU
  device_check: NoCheck
  variants: function
  dispatch:
    CPU: _fused_adam_kernel_cpu_
    CUDA: _fused_adam_kernel_cuda_
+   MPS: _fused_adam_kernel_mps_
  autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out

- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
  variants: function
  dispatch:
    CPU: _fused_adamw_kernel_cpu_
    CUDA: _fused_adamw_kernel_cuda_
+   MPS: _fused_adamw_kernel_mps_
  autogen: _fused_adamw, _fused_adamw.out

- func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
  # but still skip the device check as the Tensor LR can be on CPU
  device_check: NoCheck
  variants: function
  dispatch:
    CPU: _fused_adamw_kernel_cpu_
    CUDA: _fused_adamw_kernel_cuda_
+   MPS: _fused_adamw_kernel_mps_
  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out

- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
  variants: function
  dispatch:
    CPU: _fused_sgd_kernel_cpu_
    CUDA: _fused_sgd_kernel_cuda_
+   MPS: _fused_sgd_kernel_mps_
  autogen: _fused_sgd, _fused_sgd.out

- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
  # but still skip the device check as the Tensor LR can be on CPU
  device_check: NoCheck
  variants: function
  dispatch:
    CPU: _fused_sgd_kernel_cpu_
    CUDA: _fused_sgd_kernel_cuda_
+   MPS: _fused_sgd_kernel_mps_
  autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out

- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  variants: function
  dispatch: