GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB > Struct Template Reference#
ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB > Struct Template Reference
#include <gridwise_gemm_xdlops_bwd_weight.hpp>
Public Types | |
| using | ThisThreadBlock = ThisThreadBlock<BlockSize> |
| using | GridwiseGemmPipe |
| using | FloatAAdjusted = conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA> |
| using | FloatBAdjusted = conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB> |
| using | CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock |
| using | CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)) |
Static Public Member Functions | |
| __host__ static __device__ constexpr auto | GetABlockDescriptor_K0PerBlock_MPerBlock_K1 () |
| __host__ static __device__ constexpr auto | GetABlockDescriptor_Batch_K0PerBlock_MPerBlock_K1 () |
| __host__ static __device__ constexpr auto | GetBBlockDescriptor_K0PerBlock_NPerBlock_K1 () |
| __host__ static __device__ constexpr auto | GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1 () |
| __host__ static __device__ constexpr index_t | GetSharedMemoryNumberOfByte () |
| template<InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set> | |
| static __device__ bool constexpr | IsValidCompilationParameter () |
| template<typename Block2CTileMap> | |
| __host__ static __device__ constexpr bool | CheckValidity (const AGridDesc_B_K0_M_K1 &a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 &b_b_k0_n_k1_grid_desc, const CMNGridDesc &c_m_n_grid_desc, const Block2CTileMap &block_2_ctile_map) |
| __host__ static __device__ constexpr bool | CalculateHasMainK0BlockLoop (index_t K0) |
| __host__ static __device__ constexpr auto | MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock (const CMNGridDesc &c_m_n_grid_desc) |
| __host__ static __device__ constexpr auto | MakeCBlockClusterAdaptor (const CMNGridDesc &c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch) |
| __host__ static __device__ constexpr auto | GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock () |
| template<bool HasMainKBlockLoop> | |
| static __device__ void | Run (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, void *__restrict__ p_shared, const AGridDesc_B_K0_M_K1 &a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 &b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation &a_element_op, const BElementwiseOperation &b_element_op, const CElementwiseOperation &c_element_op, const CBlockClusterAdaptor &c_block_cluster_adaptor) |
Static Public Attributes | |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | I5 = Number<5>{} |
| static constexpr auto | I6 = Number<6>{} |
| static constexpr auto | I7 = Number<7>{} |
| static constexpr auto | K1 = Number<K1Value>{} |
| static constexpr auto | M1PerBlock = Number<ABlockLdsM1PerBlock>{} |
| static constexpr auto | M0PerBlock = Number<ABlockLdsM0PerBlock>{} |
| static constexpr auto | M1Padding = Number<ABlockLdsM1Padding>{} |
| static constexpr auto | N1PerBlock = Number<BBlockLdsN1PerBlock>{} |
| static constexpr auto | N0PerBlock = Number<BBlockLdsN0PerBlock>{} |
| static constexpr auto | N1Padding = Number<BBlockLdsN1Padding>{} |
Member Typedef Documentation
◆ CBlockClusterAdaptor
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
| using ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB >::CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)) |
◆ CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
| using ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB >::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock |
Initial value:
decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}))
ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
__host__ static __device__ constexpr auto MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc &c_m_n_grid_desc)
Definition gridwise_gemm_xdlops_bwd_weight.hpp:608
◆ FloatAAdjusted
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
| using ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB >::FloatAAdjusted = conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA> |
◆ FloatBAdjusted
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
| using ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB >::FloatBAdjusted = conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB> |
◆ GridwiseGemmPipe
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
| using ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB >::GridwiseGemmPipe |
Initial value:
constexpr auto GridwiseGemmPipeline_Selector()
Definition gridwise_gemm_pipeline_selector.hpp:31
◆ ThisThreadBlock
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
| using ck::GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, FloatA, FloatB, FloatAcc, FloatC, CGlobalMemoryDataOperation, AGridDesc_B_K0_M_K1, BGridDesc_B_K0_N_K1, CMNGridDesc, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, MPerBlock, NPerBlock, K0PerBlock, MPerXdl, NPerXdl, K1Value, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, ABlockLdsM1PerBlock, ABlockLdsM0PerBlock, ABlockLdsM1Padding, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, BBlockLdsN1PerBlock, BBlockLdsN0PerBlock, BBlockLdsN1Padding, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CBlockTransferScalarPerVector_NWaveNPerXDL, CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, ABlockLdsExtraM1Wrw, BBlockLdsExtraN1Wrw, NumGemmKPrefetchStage, PipelineVer, ComputeTypeA, ComputeTypeB >::ThisThreadBlock = ThisThreadBlock<BlockSize> |
Member Function Documentation
◆ CalculateHasMainK0BlockLoop()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ CheckValidity()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
template<typename Block2CTileMap>
|
inlinestaticconstexpr |
◆ GetABlockDescriptor_Batch_K0PerBlock_MPerBlock_K1()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ GetBBlockDescriptor_K0PerBlock_NPerBlock_K1()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ GetSharedMemoryNumberOfByte()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ IsValidCompilationParameter()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
template<InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
|
inlinestaticconstexpr |
◆ MakeCBlockClusterAdaptor()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
inlinestaticconstexpr |
◆ Run()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
template<bool HasMainKBlockLoop>
|
inlinestatic |
Member Data Documentation
◆ I0
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I1
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I2
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I3
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I4
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I5
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I6
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ I7
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ K1
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ M0PerBlock
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ M1Padding
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ M1PerBlock
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ N0PerBlock
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ N1Padding
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
◆ N1PerBlock
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename FloatC, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CMNGridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t MPerXdl, index_t NPerXdl, index_t K1Value, index_t MRepeat, index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool ABlockLdsExtraM, index_t ABlockLdsM1PerBlock, index_t ABlockLdsM0PerBlock, index_t ABlockLdsM1Padding, typename BBlockTransferThreadClusterLengths_K0_N_K1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_K1, bool BThreadTransferSrcResetCoordinateAfterRun, bool BBlockLdsExtraN, index_t BBlockLdsN1PerBlock, index_t BBlockLdsN0PerBlock, index_t BBlockLdsN1Padding, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, index_t CBlockTransferScalarPerVector_NWaveNPerXDL, typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, bool ABlockLdsExtraM1Wrw = false, bool BBlockLdsExtraN1Wrw = false, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1, typename ComputeTypeA = FloatA, typename ComputeTypeB = ComputeTypeA>
|
staticconstexpr |
The documentation for this struct was generated from the following file: