-
Notifications
You must be signed in to change notification settings - Fork 569
MetalPerformancePrimitives iOS xcode27.0 b1
Alex Soto edited this page Jun 9, 2026
·
1 revision
# MetalPerformancePrimitives.framework
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h 2026-04-18 22:46:48
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h 2026-05-24 03:49:30
@@ -120,7 +120,7 @@
"only group size 1 supported currently");
private:
- thread int2 __offset;
+ int2 __offset;
public:
convolution2d() thread : __offset(0)
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2026-04-18 22:33:40
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2026-05-23 02:19:36
@@ -10,55 +10,77 @@
// C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
// Data type combinations supported by this operation are as follows:
//
-// Left Right Destination
-// ------- ------------- -----------
-// half half half
-// half int8_t half
-// half uint8_t half
-// int8_t half half
-// uint8_t half half
-// half half float
-// half float float
-// half int8_t float
-// half uint8_t float
-// float half float
-// float float float
-// float int8_t float
-// float uint8_t float
-// int8_t half float
-// uint8_t half float
-// int8_t float float
-// uint8_t float float
-// int8_t int8_t int32_t
-// uint8_t uint8_t int32_t
-// bfloat bfloat bfloat
-// bfloat bfloat float
-// bfloat float float
-// bfloat int8_t bfloat
-// bfloat int8_t float
-// float bfloat float
-// int8_t bfloat bfloat
-// int8_t bfloat float
-// bfloat half bfloat
-// bfloat half half
-// bfloat half float
-// half bfloat bfloat
-// half bfloat half
-// half bfloat float
-// bfloat uint8_t bfloat
-// bfloat uint8_t float
-// uint8_t bfloat bfloat
-// uint8_t bfloat float
-// half int4b_format half
-// half int4b_format float
-// half uint4b_format half
-// half uint4b_format float
-// int8_t int4b_format int32_t
-// uint8_t uint4b_format int32_t
-// bfloat int4b_format bfloat
-// bfloat uint4b_format bfloat
-// bfloat int4b_format float
-// bfloat uint4b_format float
+// Left Right Destination
+// --------------------- --------------------- -----------
+// half half half
+// half int8_t half
+// half uint8_t half
+// int8_t half half
+// uint8_t half half
+// half half float
+// half float float
+// half int8_t float
+// half uint8_t float
+// float half float
+// float float float
+// float int8_t float
+// float uint8_t float
+// int8_t half float
+// uint8_t half float
+// int8_t float float
+// uint8_t float float
+// int8_t int8_t int32_t
+// uint8_t uint8_t int32_t
+// bfloat bfloat bfloat
+// bfloat bfloat float
+// bfloat float float
+// bfloat int8_t bfloat
+// bfloat int8_t float
+// float bfloat float
+// int8_t bfloat bfloat
+// int8_t bfloat float
+// bfloat half bfloat
+// bfloat half half
+// bfloat half float
+// half bfloat bfloat
+// half bfloat half
+// half bfloat float
+// bfloat uint8_t bfloat
+// bfloat uint8_t float
+// uint8_t bfloat bfloat
+// uint8_t bfloat float
+// half int4b_format half
+// half int4b_format float
+// half uint4b_format half
+// half uint4b_format float
+// int8_t int4b_format int32_t
+// uint8_t uint4b_format int32_t
+// bfloat int4b_format bfloat
+// bfloat uint4b_format bfloat
+// bfloat int4b_format float
+// bfloat uint4b_format float
+// int8_t int2b_format int32_t
+// uint8_t uint2b_format int32_t
+// half int2b_format half
+// half int2b_format float
+// half uint2b_format half
+// half uint2b_format float
+// bfloat int2b_format bfloat
+// bfloat uint2b_format bfloat
+// bfloat int2b_format float
+// bfloat uint2b_format float
+// half metal_fp4_e2m1_format half
+// half metal_fp4_e2m1_format float
+// half metal_fp8_e4m3_format half
+// half metal_fp8_e4m3_format float
+// half metal_fp8_e5m2_format half
+// half metal_fp8_e5m2_format float
+// metal_fp4_e2m1_format metal_fp4_e2m1_format half
+// metal_fp4_e2m1_format metal_fp4_e2m1_format float
+// metal_fp8_e4m3_format metal_fp8_e4m3_format half
+// metal_fp8_e4m3_format metal_fp8_e4m3_format float
+// metal_fp8_e5m2_format metal_fp8_e5m2_format half
+// metal_fp8_e5m2_format metal_fp8_e5m2_format float
//
// Basic usage is in the following example which takes M x K matrix A of type
// half, K x N matrix B of type half, both in device memory and produces M x N
@@ -249,7 +271,7 @@
// // be valid. Use the valid element check shown below to guard
// // access to elements of cooperative_tensor
//
-// auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+// auto cT = matmulOp.get_destination_cooperative_tensor<__remove_addrspace_t<decltype(mA)>, __remove_addrspace_t<decltype(mB)>, float>();
//
// // Loop over all the elements of cooperative_tensor thread elements owned
// // by "this" thread and initialize to zero.
@@ -268,7 +290,7 @@
//
// // create cooperative bias tensor with same layout as destination
// // cooperative_tensor of matmul
-// auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+// auto biasT = matmulOp.get_destination_cooperative_tensor<__remove_addrspace_t<decltype(mA)>, __remove_addrspace_t<decltype(mB)>, float>();
//
// // load data from bias tensor_handle into biasT cooperative_tensor using
// // layout and distribution of element among threads of scope on which matmul was created.
@@ -425,9 +447,9 @@
template <typename LeftElementType, typename RightElementType,
typename ElementType, typename CoordType = int,
typename U = __tensor_ops_detail::__enable_if_t<
- __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
__tensor_ops_detail::__is_integral_v<CoordType>>,
typename... CoopArgs>
INLINE cooperative_tensor_left_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -441,10 +463,10 @@
typename ElementType, typename CoordType = int,
typename SrcElemType, typename SrcExtents, typename SrcLayout,
typename U = __tensor_ops_detail::__enable_if_t<
- __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType> &&
+ __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<SrcElemType> &&
__tensor_ops_detail::__is_integral_v<CoordType>>,
typename... CoopArgs>
INLINE cooperative_tensor_left_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -458,10 +480,10 @@
template <typename LeftElementType, typename RightElementType, typename ElementType,
typename SrcElemType, typename SrcExtents, typename SrcLayout,
typename U = __tensor_ops_detail::__enable_if_t<
- __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType>>>
+ __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<SrcElemType>>>
INLINE bool
is_compatible_as_left_input(const thread metal::cooperative_tensor<SrcElemType, SrcExtents, SrcLayout> & src) thread const
{
@@ -478,9 +500,9 @@
template <typename LeftElementType, typename RightElementType,
typename ElementType, typename CoordType = int,
typename U = __tensor_ops_detail::__enable_if_t<
- __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
__tensor_ops_detail::__is_integral_v<CoordType>>,
typename... CoopArgs>
INLINE cooperative_tensor_right_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -494,10 +516,10 @@
typename ElementType, typename CoordType = int,
typename SrcElemType, typename SrcExtents, typename SrcLayout,
typename U = __tensor_ops_detail::__enable_if_t<
- __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType> &&
+ __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<SrcElemType> &&
__tensor_ops_detail::__is_integral_v<CoordType>>,
typename... CoopArgs>
INLINE cooperative_tensor_right_input_t<LeftElementType, RightElementType, ElementType, CoordType, CoopArgs...>
@@ -511,10 +533,10 @@
template <typename LeftElementType, typename RightElementType, typename ElementType,
typename SrcElemType, typename SrcExtents, typename SrcLayout,
typename U = __tensor_ops_detail::__enable_if_t<
- __tensor_ops_detail::__is_thread_addrspace_v<LeftElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<RightElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
- __tensor_ops_detail::__is_thread_addrspace_v<SrcElemType>>>
+ __tensor_ops_detail::__is_unqualified_v<LeftElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<RightElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<SrcElemType>>>
INLINE bool
is_compatible_as_right_input(const thread metal::cooperative_tensor<SrcElemType, SrcExtents, SrcLayout> & src) thread const
{
@@ -533,7 +555,7 @@
typename U = __tensor_ops_detail::__enable_if_t<
(__tensor_ops_detail::__is_tensor_type_v<LeftOperandType> || __tensor_ops_detail::__is_cooperative_tensor_type_v<LeftOperandType>) &&
(__tensor_ops_detail::__is_tensor_type_v<RightOperandType> || __tensor_ops_detail::__is_cooperative_tensor_type_v<RightOperandType>) &&
- __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+ __tensor_ops_detail::__is_unqualified_v<ElementType> &&
__tensor_ops_detail::__is_integral_v<CoordType>>,
typename... CoopArgs>
INLINE cooperative_tensor_destination_t<LeftOperandType, RightOperandType, ElementType, CoordType, CoopArgs...>
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h 2026-04-18 21:24:13
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsAvailability.h 2026-05-23 02:54:37
@@ -8,5 +8,6 @@
#define __MetalTensorOpsAvailability__
#define __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_26_2 ((__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__) >= 260200)
+#define __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0 ((__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__) >= 270000)
#endif
diff -ruN /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_26.5.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2026-04-18 22:33:40
+++ /Applications/Xcode_27.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2026-05-22 05:10:45
@@ -215,6 +215,28 @@
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype,
int);
+#if __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_copy_construct(
+ __matmul2d_cooperative_operand_index,
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_tensor_move_construct(
+ __matmul2d_cooperative_operand_index,
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+#endif
#if !__TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_26_2
extern "C" EXTERNALLY_DEFINED_ATTR void
@@ -713,7 +735,7 @@
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype);
-extern "C" EXTERNALLY_DEFINED_ATTR thread uint16_t
+extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_get_element_index(
__matmul2d_descriptor,
__tensor_ops_detail::__const_thread_void_t,
@@ -763,6 +785,28 @@
uint16_t,
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype);
+#if __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_copy_construct(
+ __matmul2d_descriptor,
+ int,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_move_construct(
+ __matmul2d_descriptor,
+ int,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+#endif
extern "C" EXTERNALLY_DEFINED_ATTR bool
__tensorops_impl_matmul2d_op_cooperative_destination_is_iterator_compatible(
__matmul2d_descriptor,
@@ -1012,6 +1056,1844 @@
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype);
+#if __TENSOR_OPS_SUPPORT_DEPLOYMENT_TARGET_27_0
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f16_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_ui8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_i8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_dv_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_dv_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_ui8_tg_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_tg_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_ui8_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_dv_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_dv_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_ui8_tg_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_tg_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_ui8_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_dv_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_tg_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_dv_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_tg_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_ui8_ui8_i32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f32_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_i8_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_f32_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_dv_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_tg_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_i8_b16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_b16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_dv_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_tg_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_dv_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_tg_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_b16_f16_f16_v2(thread matmul2d_descriptor & desc, thread void *left, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_tg_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int destinationStaticExtent0, int destinationStaticExtent1, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_f16_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32_v2(thread matmul2d_descriptor & desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, __tensor_ops_detail::__tensor_ops_datatype leftScaleDataType, int leftScaleBlockSize0, int leftScaleBlockSize1, int leftStaticExtent0, int leftStaticExtent1, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, __tensor_ops_detail::__tensor_ops_datatype rightScaleDataType, int rightScaleBlockSize0, int rightScaleBlockSize1, int rightStaticExtent0, int rightStaticExtent1, thread vo