Skip to content

MetalPerformancePrimitives iOS xcode27.0 b1

Alex Soto edited this page Jun 9, 2026 · 1 revision

# MetalPerformancePrimitives.framework

diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h	2026-04-18 22:46:48
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h	2026-05-24 03:49:30
@@ -120,7 +120,7 @@
                 "only group size 1 supported currently");
 
 private:
-  thread int2 __offset;
+  int2 __offset;
 
 public:
   convolution2d() thread : __offset(0)
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2026-04-18 22:33:40
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2026-05-23 02:19:36
@@ -10,55 +10,77 @@
 // C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
 // Data type combinations supported by this operation are as follows:
 //
-//  Left     Right          Destination
-//  -------  -------------  -----------
-//  half     half           half
-//  half     int8_t         half
-//  half     uint8_t        half
-//  int8_t   half           half
-//  uint8_t  half           half
-//  half     half           float
-//  half     float          float
-//  half     int8_t         float
-//  half     uint8_t        float
-//  float    half           float
-//  float    float          float
-//  float    int8_t         float
-//  float    uint8_t        float
-//  int8_t   half           float
-//  uint8_t  half           float
-//  int8_t   float          float
-//  uint8_t  float          float
-//  int8_t   int8_t         int32_t
-//  uint8_t  uint8_t        int32_t
-//  bfloat   bfloat         bfloat
-//  bfloat   bfloat         float
-//  bfloat   float          float
-//  bfloat   int8_t         bfloat
-//  bfloat   int8_t         float
-//  float    bfloat         float
-//  int8_t   bfloat         bfloat
-//  int8_t   bfloat         float
-//  bfloat   half           bfloat
-//  bfloat   half           half
-//  bfloat   half           float
-//  half     bfloat         bfloat
-//  half     bfloat         half
-//  half     bfloat         float
-//  bfloat   uint8_t        bfloat
-//  bfloat   uint8_t        float
-//  uint8_t  bfloat         bfloat
-//  uint8_t  bfloat         float
-//  half     int4b_format   half
-//  half     int4b_format   float
-//  half     uint4b_format  half
-//  half     uint4b_format  float
-//  int8_t   int4b_format   int32_t
-//  uint8_t  uint4b_format  int32_t
-//  bfloat   int4b_format   bfloat
-//  bfloat   uint4b_format  bfloat
-//  bfloat   int4b_format   float
-//  bfloat   uint4b_format  float
+//  Left                   Right                  Destination
+//  ---------------------  ---------------------  -----------
+//  half                   half                   half
+//  half                   int8_t                 half
+//  half                   uint8_t                half
+//  int8_t                 half                   half
+//  uint8_t                half                   half
+//  half                   half                   float
+//  half                   float                  float
+//  half                   int8_t                 float
+//  half                   uint8_t                float
+//  float                  half                   float
+//  float                  float                  float
+//  float                  int8_t                 float
+//  float                  uint8_t                float
+//  int8_t                 half                   float
+//  uint8_t                half                   float
+//  int8_t                 float                  float
+//  uint8_t                float                  float
+//  int8_t                 int8_t                 int32_t
+//  uint8_t                uint8_t                int32_t
+//  bfloat                 bfloat                 bfloat
+//  bfloat                 bfloat                 float
+//  bfloat                 float                  float
+//  bfloat                 int8_t                 bfloat
+//  bfloat                 int8_t                 float
+//  float                  bfloat                 float
+//  int8_t                 bfloat                 bfloat
+//  int8_t                 bfloat                 float
+//  bfloat                 half                   bfloat
+//  bfloat                 half                   half
+//  bfloat                 half                   float
+//  half                   bfloat                 bfloat
+//  half                   bfloat                 half
+//  half                   bfloat                 float
+//  bfloat                 uint8_t                bfloat
+//  bfloat                 uint8_t                float
+//  uint8_t                bfloat                 bfloat
+//  uint8_t                bfloat                 float
+//  half                   int4b_format           half
+//  half                   int4b_format           float
+//  half                   uint4b_format          half
+//  half                   uint4b_format          float
+//  int8_t                 int4b_format           int32_t
+//  uint8_t                uint4b_format          int32_t
+//  bfloat                 int4b_format           bfloat
+//  bfloat                 uint4b_format          bfloat
+//  bfloat                 int4b_format           float
+//  bfloat                 uint4b_format          float
+//  int8_t                 int2b_format           int32_t
+//  uint8_t                uint2b_format          int32_t
+//  half                   int2b_format           half
+//  half                   int2b_format           float
+//  half                   uint2b_format          half
+//  half                   uint2b_format          float
+//  bfloat                 int2b_format           bfloat
+//  bfloat                 uint2b_format          bfloat
+//  bfloat                 int2b_format           float
+//  bfloat                 uint2b_format          float
+//  half                   metal_fp4_e2m1_format  half
+//  half                   metal_fp4_e2m1_format  float
+//  half                   metal_fp8_e4m3_format  half
+//  half                   metal_fp8_e4m3_format  float
+//  half                   metal_fp8_e5m2_format  half
+//  half                   metal_fp8_e5m2_format  float
+//  metal_fp4_e2m1_format  metal_fp4_e2m1_format  half
+//  metal_fp4_e2m1_format  metal_fp4_e2m1_format  float
+//  metal_fp8_e4m3_format  metal_fp8_e4m3_format  half
+//  metal_fp8_e4m3_format  metal_fp8_e4m3_format  float
+//  metal_fp8_e5m2_format  metal_fp8_e5m2_format  half
+//  metal_fp8_e5m2_format  metal_fp8_e5m2_format  float
 //
 // Basic usage is in the following example which takes M x K matrix A of type
 // half, K x N matrix B of type half, both in device memory and produces M x N
@@ -249,7 +271,7 @@
 //    // be valid. Use the valid element check shown below to guard
 //    // access to elements of cooperative_tensor
 //
-//    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//    auto cT = matmulOp.get_destination_cooperative_tensor<__remove_addrspace_t<decltype(mA)>, __remove_addrspace_t<decltype(mB)>, float>();
 //
 //    // Loop over all the elements of cooperative_tensor thread elements owned
 //    // by "this" thread and initialize to zero.
@@ -268,7 +290,7 @@
 //
 //    // create cooperative bias tensor with same layout as destination
 //    // cooperative_tensor of matmul
-//    auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//    auto biasT = matmulOp.get_destination_cooperative_tensor<__remove_addrspace_t<decltype(mA)>, __remove_addrspace_t<decltype(mB)>, float>();
 //
 //    // load data from bias tensor_handle into biasT cooperative_tensor using
 //    // layout and distribution of element among threads of scope on which matmul was created.
@@ -425,9 +447,9 @@
   template <typename LeftElementType, typename RightElementType,
             typename ElementType, typename CoordType = int,
             typename U = __tensor_ops_detail::__enable_if_t<
-                __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
                 __tensor_ops_detail::__is_integral_v<CoordType>>,
             typename... CoopArgs>
   INLINE cooperative_tensor_left_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -441,10 +463,10 @@
             typename ElementType, typename CoordType = int,
             typename SrcElemType, typename SrcExtents, typename SrcLayout,
             typename U = __tensor_ops_detail::__enable_if_t<
-                __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType> &&
+                __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<SrcElemType> &&
                 __tensor_ops_detail::__is_integral_v<CoordType>>,
             typename... CoopArgs>
   INLINE cooperative_tensor_left_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -458,10 +480,10 @@
   template <typename LeftElementType, typename RightElementType, typename ElementType,
             typename SrcElemType, typename SrcExtents, typename SrcLayout,
             typename U = __tensor_ops_detail::__enable_if_t<
-                __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType>>>
+                __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<SrcElemType>>>
   INLINE bool
   is_compatible_as_left_input(const thread metal::cooperative_tensor<SrcElemType, SrcExtents, SrcLayout> & src) thread const
   {
@@ -478,9 +500,9 @@
   template <typename LeftElementType, typename RightElementType,
             typename ElementType, typename CoordType = int,
             typename U = __tensor_ops_detail::__enable_if_t<
-                __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
                 __tensor_ops_detail::__is_integral_v<CoordType>>,
             typename... CoopArgs>
   INLINE cooperative_tensor_right_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -494,10 +516,10 @@
             typename ElementType, typename CoordType = int,
             typename SrcElemType, typename SrcExtents, typename SrcLayout,
             typename U = __tensor_ops_detail::__enable_if_t<
-                __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType> &&
+                __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<SrcElemType> &&
                 __tensor_ops_detail::__is_integral_v<CoordType>>,
             typename... CoopArgs>
   INLINE cooperative_tensor_right_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -511,10 +533,10 @@
   template <typename LeftElementType, typename RightElementType, typename ElementType,
             typename SrcElemType, typename SrcExtents, typename SrcLayout,
             typename U = __tensor_ops_detail::__enable_if_t<
-                __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
-                __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType>>>
+                __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<SrcElemType>>>
   INLINE bool
   is_compatible_as_right_input(const thread metal::cooperative_tensor<SrcElemType, SrcExtents, SrcLayout> & src) thread const
   {
@@ -533,7 +555,7 @@
             typename U = __tensor_ops_detail::__enable_if_t<
                 (__tensor_ops_detail::__is_tensor_type_v<LeftOperandType> || __tensor_ops_detail::__is_cooperative_tensor_type_v<LeftOperandType>) &&
                 (__tensor_ops_detail::__is_tensor_type_v<RightOperandType> || __tensor_ops_detail::__is_cooperative_tensor_type_v<RightOperandType>) &&
-                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+                __tensor_ops_detail::__is_unqualified_v<ElementType> &&
                 __tensor_ops_detail::__is_integral_v<CoordType>>,
             typename... CoopArgs>
   INLINE cooperative_tensor_destination_t<LeftOperandType, RightOperandType, ElementType, CoordType, CoopArgs...>
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h	2026-04-18 21:24:13
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h	2026-05-23 02:54:37
@@ -8,5 +8,6 @@
 #define __MetalTensorOpsAvailability__
 
 #define __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_26_2 ((__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__) >= 260200)
+#define __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0 ((__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__) >= 270000)
 
 #endif
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2026-04-18 22:33:40
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2026-05-22 05:10:45
@@ -215,6 +215,28 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype,
     int);
+#if __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_copy_construct(
+    __matmul2d_cooperative_operand_index,
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_move_construct(
+    __matmul2d_cooperative_operand_index,
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
+#endif
 
 #if !__TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_26_2
 extern "C" EXTERNALLY_DEFINED_ATTR void
@@ -713,7 +735,7 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
-extern "C" EXTERNALLY_DEFINED_ATTR thread uint16_t
+extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
 __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_element_index(
     __matmul2d_descriptor,
     __tensor_ops_detail::__const_thread_void_t,
@@ -763,6 +785,28 @@
     uint16_t,
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
+#if __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_copy_construct(
+    __matmul2d_descriptor,
+    int,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_move_construct(
+    __matmul2d_descriptor,
+    int,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
+#endif
 extern "C" EXTERNALLY_DEFINED_ATTR bool
 __tensorops_impl_matmul2d_op_cooperative_destination_is_iterator_compatible(
     __matmul2d_descriptor,
@@ -1012,6 +1056,1844 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
 
+#if __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: declarations whose left field is tg_ui8. The three name fields after run_/run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (left/right also get a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and ui8/f16/f32 the element types - confirm against the callers in this header. This line: left tg ui8 (desc+scale+extents), right dv f16 (desc+scale+extents), destination tg f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left tg ui8 (desc+scale+extents), right dv f16 (desc+scale+extents), destination f16 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg ui8 (desc+scale+extents), right tg f16 (desc+scale+extents), destination dv f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg ui8 (desc+scale+extents), right tg f16 (desc+scale+extents), destination tg f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left tg ui8 (desc+scale+extents), right tg f16 (desc+scale+extents), destination f16 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg ui8 (desc+scale+extents), right f16 (pointer only), destination dv f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg ui8 (desc+scale+extents), right f16 (pointer only), destination tg f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // left tg ui8 (desc+scale+extents), right f16 (pointer only), destination f16 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: run_cooperative declarations whose left field is an untagged ui8, so left is passed as a bare thread pointer. The three name fields after run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (right also gets a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and ui8/f16 the element types - confirm against the callers in this header. This line: left ui8 (pointer only), right dv f16 (desc+scale+extents), destination dv f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left ui8 (pointer only), right dv f16 (desc+scale+extents), destination tg f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left ui8 (pointer only), right dv f16 (desc+scale+extents), destination f16 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left ui8 (pointer only), right tg f16 (desc+scale+extents), destination dv f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left ui8 (pointer only), right tg f16 (desc+scale+extents), destination tg f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left ui8 (pointer only), right tg f16 (desc+scale+extents), destination f16 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left ui8 (pointer only), right f16 (pointer only), destination dv f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left ui8 (pointer only), right f16 (pointer only), destination tg f16 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); // left ui8, right f16, destination f16: all three untagged, so only the descriptor reference, three thread pointers and threads are passed
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: declarations whose left field is dv_f16, with an f16 right field and an f32 destination field. The three name fields after run_/run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (left/right also get a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and f16/f32 the element types - confirm against the callers in this header. This line: left dv f16 (desc+scale+extents), right dv f16 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right dv f16 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left dv f16 (desc+scale+extents), right dv f16 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right tg f16 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right tg f16 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left dv f16 (desc+scale+extents), right tg f16 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right f16 (pointer only), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right f16 (pointer only), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // left dv f16 (desc+scale+extents), right f16 (pointer only), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: declarations whose left field is tg_f16, with an f16 right field and an f32 destination field. The three name fields after run_/run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (left/right also get a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and f16/f32 the element types - confirm against the callers in this header. This line: left tg f16 (desc+scale+extents), right dv f16 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right dv f16 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left tg f16 (desc+scale+extents), right dv f16 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right tg f16 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right tg f16 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left tg f16 (desc+scale+extents), right tg f16 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right f16 (pointer only), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right f16 (pointer only), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // left tg f16 (desc+scale+extents), right f16 (pointer only), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: run_cooperative declarations whose left field is an untagged f16 (left passed as a bare thread pointer), with an f16 right field and an f32 destination field. The three name fields after run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (right also gets a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and f16/f32 the element types - confirm against the callers in this header. This line: left f16 (pointer only), right dv f16 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left f16 (pointer only), right dv f16 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left f16 (pointer only), right dv f16 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left f16 (pointer only), right tg f16 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left f16 (pointer only), right tg f16 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left f16 (pointer only), right tg f16 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left f16 (pointer only), right f16 (pointer only), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left f16 (pointer only), right f16 (pointer only), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); // left f16, right f16, destination f32: all three untagged, so only the descriptor reference, three thread pointers and threads are passed
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: declarations whose left field is dv_f16, with an f32 right field and an f32 destination field. The three name fields after run_/run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (left/right also get a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and f16/f32 the element types - confirm against the callers in this header. This line: left dv f16 (desc+scale+extents), right dv f32 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right dv f32 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left dv f16 (desc+scale+extents), right dv f32 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right tg f32 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right tg f32 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left dv f16 (desc+scale+extents), right tg f32 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right f32 (pointer only), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left dv f16 (desc+scale+extents), right f32 (pointer only), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // left dv f16 (desc+scale+extents), right f32 (pointer only), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Group note: declarations whose left field is tg_f16, with an f32 right field and an f32 destination field. The three name fields after run_/run_cooperative_ line up with the left, right and destination parameter groups: a field tagged dv or tg is accompanied by descriptor-type and static-extent arguments (left/right also get a scale data type and two scale block sizes), while an untagged field gets only its thread pointer. NOTE(review): dv/tg presumably denote device/threadgroup memory, an untagged field a cooperative tensor, and f16/f32 the element types - confirm against the callers in this header. This line: left tg f16 (desc+scale+extents), right dv f32 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right dv f32 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left tg f16 (desc+scale+extents), right dv f32 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right tg f32 (desc+scale+extents), destination dv f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // left tg f16 (desc+scale+extents), right tg f32 (desc+scale+extents), destination tg f32 (desc+extents)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // left tg f16 (desc+scale+extents), right tg f32 (desc+scale+extents), destination f32 (pointer only)
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Variants tagged f32/b16/f32 whose left tag is dv_. Operand tags in the name follow left, right, destination order. A dv_/tg_ prefix on a tag coincides with extra arguments after that operand's pointer (left/right: DescType, ScaleDataType, ScaleBlockSize0/1, StaticExtent0/1; destination: DescType, StaticExtent0/1); an unprefixed operand is only a thread void *, and in these lines such names contain "cooperative". NOTE(review): dv/tg presumably mean device/threadgroup; confirm. This one: extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // extra args on: left
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Variants tagged f32/b16/f32 whose left tag is tg_. Operand tags in the name follow left, right, destination order. A dv_/tg_ prefix on a tag coincides with extra arguments after that operand's pointer (left/right: DescType, ScaleDataType, ScaleBlockSize0/1, StaticExtent0/1; destination: DescType, StaticExtent0/1); an unprefixed operand is only a thread void *, and in these lines such names contain "cooperative". NOTE(review): dv/tg presumably mean device/threadgroup; confirm. This one: extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // extra args on: left
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Cooperative variants tagged f32/b16/f32 whose left tag is unprefixed. Operand tags in the name follow left, right, destination order. A dv_/tg_ prefix on a tag coincides with extra arguments after that operand's pointer (right: DescType, ScaleDataType, ScaleBlockSize0/1, StaticExtent0/1; destination: DescType, StaticExtent0/1); an unprefixed operand is only a thread void *. NOTE(review): dv/tg presumably mean device/threadgroup; confirm. This one: extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); // no extra args: all three operands are bare pointers
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Variants tagged i8/b16/b16 whose left tag is dv_. Operand tags in the name follow left, right, destination order. A dv_/tg_ prefix on a tag coincides with extra arguments after that operand's pointer (left/right: DescType, ScaleDataType, ScaleBlockSize0/1, StaticExtent0/1; destination: DescType, StaticExtent0/1); an unprefixed operand is only a thread void *, and in these lines such names contain "cooperative". NOTE(review): dv/tg presumably mean device/threadgroup; confirm. This one: extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // extra args on: left
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Variants tagged i8/b16/b16 whose left tag is tg_. Operand tags in the name follow left, right, destination order. A dv_/tg_ prefix on a tag coincides with extra arguments after that operand's pointer (left/right: DescType, ScaleDataType, ScaleBlockSize0/1, StaticExtent0/1; destination: DescType, StaticExtent0/1); an unprefixed operand is only a thread void *, and in these lines such names contain "cooperative". NOTE(review): dv/tg presumably mean device/threadgroup; confirm. This one: extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: left, right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: left, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads); // extra args on: left
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // Cooperative variants tagged i8/b16/b16 whose left tag is unprefixed. Operand tags in the name follow left, right, destination order. A dv_/tg_ prefix on a tag coincides with extra arguments after that operand's pointer (right: DescType, ScaleDataType, ScaleBlockSize0/1, StaticExtent0/1; destination: DescType, StaticExtent0/1); an unprefixed operand is only a thread void *. NOTE(review): dv/tg presumably mean device/threadgroup; confirm. This one: extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: right, destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads); // extra args on: right
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads); // extra args on: destination
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads); // no extra args: all three operands are bare pointers
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread vo

Clone this wiki locally