20 typename AScaleDataType,
22 typename BScaleDataType,
25 typename AMmaTileDesc,
26 typename BMmaTileDesc,
27 index_t ABlockTransferSrcScalarPerVector,
28 index_t BBlockTransferSrcScalarPerVector,
41template <
index_t ThreadBlockSize,
44 typename AScaleDataType,
46 typename BScaleDataType,
49 typename AMmaTileDesc,
50 typename BMmaTileDesc,
51 index_t ABlockTransferSrcScalarPerVector,
52 index_t BBlockTransferSrcScalarPerVector,
72 ABlockTransferSrcScalarPerVector,
73 BBlockTransferSrcScalarPerVector,
89 ABlockTransferSrcScalarPerVector,
90 BBlockTransferSrcScalarPerVector,
109 ABlockTransferSrcScalarPerVector,
110 BBlockTransferSrcScalarPerVector,
172 KPerBlock / ScaleBlockSize;
186 "A scale pack data type too large!");
188 "B scale pack data type too large!");
206 constexpr auto num_ds_read_inst_a =
213 constexpr auto num_buffer_load_stage1 =
216 constexpr auto num_buffer_load_stage2 = num_buffer_load_inst_a;
221 constexpr auto ds_read_a_issue_cycle =
223 constexpr auto ds_read_a_mfma_rate =
229 constexpr auto num_total_stages = std::max(2, MRepeat);
231 if constexpr(num_total_stages > 2)
235 constexpr auto num_mfma_perstage = num_mfma_inst / num_total_stages;
236 constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
238 constexpr auto num_ds_read_a_mfma_perstage =
241 constexpr auto num_ds_read_a_prefetch_stages = 2;
243 constexpr auto buffer_load_perstage_more =
245 constexpr auto buffer_load_perstage_less =
247 constexpr auto buffer_load_perstage_stage2 =
250 constexpr auto buffer_load_stages_more =
251 num_buffer_load_stage1 -
253 ((num_total_stages - 2));
255 constexpr auto buffer_load_issue_point_interval_more =
256 num_mfma_perstage / buffer_load_perstage_more;
257 constexpr auto buffer_load_issue_point_interval_less =
258 num_mfma_perstage / buffer_load_perstage_less;
259 constexpr auto buffer_load_issue_point_interval_stage2 =
260 num_mfma_perstage / buffer_load_perstage_stage2;
266 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
268 if constexpr(imfma % buffer_load_issue_point_interval_more == 0)
270 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
273 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
275 __builtin_amdgcn_sched_group_barrier(
276 0x100, ds_read_a_mfma_rate, 0);
282 static_for<0, (num_total_stages - 2 - buffer_load_stages_more), 1>{}([&](
auto ) {
284 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
285 if constexpr(imfma % buffer_load_issue_point_interval_less == 0)
287 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
289 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
291 __builtin_amdgcn_sched_group_barrier(
292 0x100, ds_read_a_mfma_rate, 0);
301 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
302 if constexpr(imfma % buffer_load_issue_point_interval_stage2 == 0)
304 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
306 if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
308 __builtin_amdgcn_sched_group_barrier(
309 0x100, ds_read_a_mfma_rate, 0);
316 constexpr auto num_buffer_load_total = num_buffer_load_inst_a + num_buffer_load_inst_b +
320 num_ds_read_inst_a, ds_read_a_mfma_rate);
323 constexpr auto num_mfma_stage1 = num_mfma_inst - num_dsread_a_mfma;
325 constexpr auto mfma_perstage_more =
327 constexpr auto mfma_perstage_less =
330 constexpr auto mfma_stages_more =
331 num_mfma_stage1 - mfma_perstage_less * num_buffer_load_total;
334 if constexpr(i < mfma_stages_more)
337 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
339 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
344 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
346 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
351 if constexpr((i + num_buffer_load_inst_a) < mfma_stages_more)
354 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
356 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
361 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
363 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
368 if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b) <
372 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
374 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
379 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
381 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
386 if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b +
390 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
392 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
397 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
399 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
405 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
406 if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
409 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
413 __builtin_amdgcn_sched_group_barrier(
415 num_ds_read_inst_a - (num_dsread_a_mfma - 1) * ds_read_a_mfma_rate,
422 template <
bool HasMainLoop,
426 typename ABlockTransfer,
427 typename AGridBuffer,
428 typename ABlockBuffer,
429 typename ABlockTransferStep,
432 typename BBlockTransfer,
433 typename BGridBuffer,
434 typename BBlockBuffer,
435 typename BBlockTransferStep,
436 typename CThreadBuffer,
437 typename AScaleGridBuffer,
438 typename AScaleGridDesc,
439 typename AScaleThreadTransfer,
440 typename BScaleGridBuffer,
441 typename BScaleGridDesc,
442 typename BScaleThreadTransfer>
445 const AGridDesc& a_grid_desc,
446 const ABlockDesc& a_block_desc,
447 ABlockTransfer& a_blockwise_copy,
448 const AGridBuffer& a_grid_buf,
449 ABlockBuffer& a_block_bufs,
450 const ABlockTransferStep& a_block_copy_step,
452 const BGridDesc& b_grid_desc,
453 const BBlockDesc& b_block_desc,
454 BBlockTransfer& b_blockwise_copy,
455 const BGridBuffer& b_grid_buf,
456 BBlockBuffer& b_block_bufs,
457 const BBlockTransferStep& b_block_copy_step,
459 CThreadBuffer& c_thread_buf,
461 const AScaleGridDesc& a_scale_grid_desc,
462 AScaleThreadTransfer& a_scale_thread_copy,
463 const AScaleGridBuffer& a_scale_grid_buf,
464 const BScaleGridDesc& b_scale_grid_desc,
465 BScaleThreadTransfer& b_scale_thread_copy,
466 const BScaleGridBuffer& b_scale_grid_buf,
487 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(
I0));
488 b_blockwise_copy.Run(
489 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_thread_bufs(
I0));
491 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
492 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
497 a_scale_thread_copy.Run(a_scale_grid_desc,
501 a_scale_thread_bufs(
I0));
503 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
506 a_scale_thread_copy.MoveSrcSliceWindow(
511 a_scale_thread_copy.MoveSrcSliceWindow(
518 b_scale_thread_copy.Run(b_scale_grid_desc,
522 b_scale_thread_bufs(
I0));
524 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
527 b_scale_thread_copy.MoveSrcSliceWindow(
533 b_scale_thread_copy.MoveSrcSliceWindow(
546 constexpr auto a_k_step_chunk =
562 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(
I1));
563 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
566 c_thread_buf.Clear();
567 __builtin_amdgcn_sched_barrier(0);
570 if constexpr(HasMainLoop)
576 auto LoopFunc = [&](
auto scale_comp_buf,
auto scale_mem_buf) {
577 b_blockwise_copy.Run(b_grid_desc,
581 b_thread_bufs(scale_mem_buf));
586 a_scale_thread_copy.Run(a_scale_grid_desc,
590 a_scale_thread_bufs(scale_mem_buf));
592 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
595 a_scale_thread_copy.MoveSrcSliceWindow(
600 a_scale_thread_copy.MoveSrcSliceWindow(
607 b_scale_thread_copy.Run(b_scale_grid_desc,
611 b_scale_thread_bufs(scale_mem_buf));
613 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
616 b_scale_thread_copy.MoveSrcSliceWindow(
622 b_scale_thread_copy.MoveSrcSliceWindow(
627 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
630 constexpr auto im_major = m0 /
MXdlPack;
631 constexpr auto im_minor = m0 %
MXdlPack;
633 constexpr auto ik_major = k0 /
KXdlPack;
634 constexpr auto ik_minor = k0 %
KXdlPack;
636 constexpr auto in_major = n0 /
NXdlPack;
637 constexpr auto in_minor = n0 %
NXdlPack;
639 constexpr index_t a_scale_offset =
642 constexpr index_t b_scale_offset =
647 "Must have at least one scale per Xdlops "
657 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
663 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
672 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
675 b_thread_vec.template AsType<ComputeTypeB>()(ik) = b_thread_bufs
680 using mfma_input_type_a =
685 using mfma_input_type_b =
690 using mfma_scale_input_type_a =
693 using mfma_scale_input_type_b =
698 make_tuple(im_major, in_major, im_minor, in_minor, 0));
703 a_thread_vec.template AsType<mfma_input_type_a>(),
704 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
705 b_thread_vec.template AsType<mfma_input_type_b>(),
706 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
711 if constexpr(m0.value == SwitchM)
715 a_blockwise_copy.Run(a_grid_desc,
718 a_block_bufs(scale_comp_buf));
719 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
722 constexpr auto lds_buf =
723 m0.value >= SwitchM ? scale_mem_buf : scale_comp_buf;
730 1>{}([&](
auto chunk) {
731 constexpr auto a_k_step_chunk =
755 __builtin_amdgcn_sched_barrier(0);
762 }
while(i < (num_loop - 2));
768 b_blockwise_copy.Run(
769 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_thread_bufs(
I1));
774 a_scale_thread_copy.Run(a_scale_grid_desc,
778 a_scale_thread_bufs(
I1));
780 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
783 a_scale_thread_copy.MoveSrcSliceWindow(
790 b_scale_thread_copy.Run(b_scale_grid_desc,
794 b_scale_thread_bufs(
I1));
796 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
799 b_scale_thread_copy.MoveSrcSliceWindow(
804 constexpr auto im_major = m0 /
MXdlPack;
805 constexpr auto im_minor = m0 %
MXdlPack;
807 constexpr auto ik_major = k0 /
KXdlPack;
808 constexpr auto ik_minor = k0 %
KXdlPack;
810 constexpr auto in_major = n0 /
NXdlPack;
811 constexpr auto in_minor = n0 %
NXdlPack;
813 constexpr index_t a_scale_offset =
815 constexpr index_t b_scale_offset =
819 "Must have at least one scale per Xdlops "
827 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
832 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
840 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
843 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
848 using mfma_input_type_a =
852 using mfma_input_type_b =
856 using mfma_scale_input_type_a =
858 using mfma_scale_input_type_b =
862 make_tuple(im_major, in_major, im_minor, in_minor, 0));
867 a_thread_vec.template AsType<mfma_input_type_a>(),
868 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
869 b_thread_vec.template AsType<mfma_input_type_b>(),
870 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
874 if constexpr(m0.value == SwitchM)
880 constexpr auto lds_buf = m0.value >= SwitchM ?
I1 :
I0;
887 constexpr auto a_k_step_chunk =
908 constexpr auto im_major = m0 /
MXdlPack;
909 constexpr auto im_minor = m0 %
MXdlPack;
911 constexpr auto ik_major = k0 /
KXdlPack;
912 constexpr auto ik_minor = k0 %
KXdlPack;
914 constexpr auto in_major = n0 /
NXdlPack;
915 constexpr auto in_minor = n0 %
NXdlPack;
917 constexpr index_t a_scale_offset =
919 constexpr index_t b_scale_offset =
923 "Must have at least one scale per Xdlops "
931 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
936 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
944 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
947 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
952 using mfma_input_type_a =
956 using mfma_input_type_b =
960 using mfma_scale_input_type_a =
962 using mfma_scale_input_type_b =
966 make_tuple(im_major, in_major, im_minor, in_minor, 0));
971 a_thread_vec.template AsType<mfma_input_type_a>(),
972 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
973 b_thread_vec.template AsType<mfma_input_type_b>(),
974 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
985 constexpr auto a_k_step_chunk =
1012 constexpr auto im_major = m0 /
MXdlPack;
1013 constexpr auto im_minor = m0 %
MXdlPack;
1015 constexpr auto ik_major = k0 /
KXdlPack;
1016 constexpr auto ik_minor = k0 %
KXdlPack;
1018 constexpr auto in_major = n0 /
NXdlPack;
1019 constexpr auto in_minor = n0 %
NXdlPack;
1021 constexpr index_t a_scale_offset =
1023 constexpr index_t b_scale_offset =
1027 "Must have at least one scale per Xdlops "
1035 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
1040 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
1048 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
1051 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
1056 using mfma_input_type_a =
1060 using mfma_input_type_b =
1064 using mfma_scale_input_type_a =
1066 using mfma_scale_input_type_b =
1070 make_tuple(im_major, in_major, im_minor, in_minor, 0));
1075 a_thread_vec.template AsType<mfma_input_type_a>(),
1076 a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
1077 b_thread_vec.template AsType<mfma_input_type_b>(),
1078 b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
1089 constexpr auto a_k_step_chunk =
1136 Number<KRepeat / KXdlPack>{},
1143 Number<KRepeat / KXdlPack>{},
1149 using Base::b_thread_copy_;
1150 using Base::b_thread_desc_;
1151 using Base::c_thread_desc_;
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
Definition utility/math.hpp:66
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetWaveIdx static __device__ auto GetWaveIdx()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:118
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType float AccType
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopInstList ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)> HotLoopInstList
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:88
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA ADataType ComputeTypeA
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:34
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:344
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AMmaKStride static constexpr index_t AMmaKStride
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:68
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:49
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 __host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:220
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:269
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_n3_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_n3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:382
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:297
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 decltype(CalculateAThreadOriginDataIndex()) Tuple5
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:184
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:41
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:42
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:79
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::CalculateCThreadOriginDataIndex static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:154
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MXdlPack static constexpr index_t MXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:84
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 __host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:283
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:361
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:60
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_m3_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:381
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KThreadChunk static constexpr index_t KThreadChunk
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:74
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KXdlPack static constexpr index_t KXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:86
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:234
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadBuffer __host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:116
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB BDataType ComputeTypeB
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:35
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::APackedSize static constexpr index_t APackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockwiseGemmXdlops_mx_pipeline_base __host__ __device__ BlockwiseGemmXdlops_mx_pipeline_base(Tuple5 a_origin=CalculateAThreadOriginDataIndex(), Tuple5 b_origin=CalculateBThreadOriginDataIndex())
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:204
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BMmaKStride static constexpr index_t BMmaKStride
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:69
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NXdlPack static constexpr index_t NXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:85
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:327
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:55
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BPackedSize static constexpr index_t BPackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:39
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::CalculateAThreadOriginDataIndex static __device__ auto CalculateAThreadOriginDataIndex()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:130
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_LDS_Read_Width static constexpr index_t A_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:82
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::C_MFMA_Inst_Num static constexpr index_t C_MFMA_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:54
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::C_MFMA_Inst_Cycle static constexpr index_t C_MFMA_Inst_Cycle
Definition blkgemmpipe_scheduler.hpp:105
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:42
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA typename Base::ComputeTypeA ComputeTypeA
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:156
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MXdlPack static constexpr index_t MXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:84
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:162
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:49
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_m3_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:381
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:41
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRunPerThread static constexpr auto ScalesPerXdlopsRunPerThread
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:179
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KThreadChunk static constexpr index_t KThreadChunk
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:74
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:102
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::async_vmcnt static constexpr auto async_vmcnt
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:167
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:392
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB typename Base::ComputeTypeB ComputeTypeB
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:157
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRun static constexpr auto ScalesPerXdlopsRun
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:175
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:396
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType typename Base::AccType AccType
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:154
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KXdlPack static constexpr index_t KXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:86
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:202
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerKBlockSize static constexpr auto ScalesPerKBlockSize
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:171
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_vec_size static constexpr auto b_scale_thread_vec_size
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:190
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NXdlPack static constexpr index_t NXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:85
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:1130
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::num_buffer_load_a_scale static constexpr auto num_buffer_load_a_scale
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:165
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::num_buffer_load_b_scale static constexpr auto num_buffer_load_b_scale
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:166
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_b static constexpr auto scale_pack_size_b
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:184
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:79
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:1118
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_bufs, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_bufs, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, const AScaleGridDesc &a_scale_grid_desc, AScaleThreadTransfer &a_scale_thread_copy, const AScaleGridBuffer &a_scale_grid_buf, const BScaleGridDesc &b_scale_grid_desc, BScaleThreadTransfer &b_scale_thread_copy, const BScaleGridBuffer &b_scale_grid_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:443
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum static __host__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:197
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:60
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_m0_m1_m2_m3_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, KThreadChunk >, Sequence< 0, 1, 2, 3, 4 >, 4, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:1121
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::mx_scale_t e8m0_bexp_t mx_scale_t
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:182
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ARegBuf static constexpr auto ARegBuf
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:1117
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::APackedSize static constexpr index_t APackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::LocalPrefetchStages static constexpr index_t LocalPrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:160
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:159
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_vec_size static constexpr auto a_scale_thread_vec_size
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:189
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:161
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:55
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::async_vmcnt_encoding static constexpr auto async_vmcnt_encoding
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:169
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop static __host__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:192
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BPackedSize static constexpr index_t BPackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:39
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_a static constexpr auto scale_pack_size_a
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:183
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotloopLocalBufSwitch static constexpr index_t HotloopLocalBufSwitch
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:163
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_desc static constexpr auto a_scale_thread_desc
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:1134
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_desc static constexpr auto b_scale_thread_desc
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:1141
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 typename Base::Tuple5 Tuple5
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:155
Definition blockwise_gemm_pipeline_xdlops_v3_mx_bpreshuffle.hpp:38
Definition utility/sequence.hpp:43
Definition threadwise_tensor_slice_transfer.hpp:1260
Unsigned representation of a conventional biased Float32 exponent.
Definition utility/e8m0.hpp:26
Definition functional2.hpp:33
Definition dtype_vector.hpp:10