DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType > Struct Template Reference#
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType > Struct Template Reference
#include <device_gemm_xdl_cshuffle_lds_direct_load.hpp>
Inheritance diagram for ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType >:
Classes | |
| struct | Invoker |
Public Types | |
| template<index_t NXdlPerWave_> | |
| using | GridwiseGemmBase |
| using | GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)> |
| using | GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32> |
| using | Argument = typename GridwiseGemm64::Argument |
Public Member Functions | |
| bool | IsSupportedArgument (const BaseArgument *p_arg) override |
| std::unique_ptr< BaseArgument > | MakeArgumentPointer (const void *p_a, const void *p_b, void *p_e, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, index_t StrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op) override |
| std::unique_ptr< BaseInvoker > | MakeInvokerPointer () override |
| std::string | GetTypeString () const override |
| Public Member Functions inherited from ck::tensor_operation::device::BaseOperator | |
| BaseOperator ()=default | |
| BaseOperator (const BaseOperator &)=default | |
| BaseOperator & | operator= (const BaseOperator &)=default |
| virtual std::string | GetInstanceString () const |
| virtual std::string | GetTypeIdName () const |
| virtual std::optional< std::string > | GetObjectName () const |
| virtual std::optional< std::string > | GetTemplateInfo () const |
| virtual std::string | GetTypeIdHashCode () const |
| virtual size_t | GetWorkSpaceSize (const BaseArgument *) const |
| virtual void | SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const |
| virtual | ~BaseOperator () |
Static Public Member Functions | |
| static bool | IsSupportedArgument (const Argument &arg) |
| static auto | MakeArgument (const void *p_a, const void *p_b, void *p_e, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, index_t StrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op) |
| static auto | MakeInvoker () |
Static Public Attributes | |
| static GET_NXDL_PER_WAVE_IMPL constexpr auto | NXdlPerWave64 = GetNXdlPerWave<true>() |
| static constexpr auto | NXdlPerWave32 = GetNXdlPerWave<false>() |
| static constexpr auto | I1 = Number<1>{} |
Member Typedef Documentation
◆ Argument
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
| using ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType >::Argument = typename GridwiseGemm64::Argument |
◆ GridwiseGemm32
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
| using ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType >::GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32> |
◆ GridwiseGemm64
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
| using ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType >::GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)> |
◆ GridwiseGemmBase
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
template<index_t NXdlPerWave_>
| using ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_LdsDirectLoad< ALayout, BLayout, ELayout, ADataType, BDataType, EDataType, AccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferScalarPerVector, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferScalarPerVector, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVer, ComputeDataType >::GridwiseGemmBase |
Member Function Documentation
◆ GetTypeString()
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [1/2]
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlinestatic |
◆ IsSupportedArgument() [2/2]
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ MakeArgument()
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlinestatic |
◆ MakeArgumentPointer()
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlineoverridevirtual |
◆ MakeInvoker()
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlinestatic |
◆ MakeInvokerPointer()
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
inlineoverridevirtual |
Member Data Documentation
◆ I1
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
staticconstexpr |
◆ NXdlPerWave32
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
staticconstexpr |
◆ NXdlPerWave64
template<typename ALayout, typename BLayout, typename ELayout, typename ADataType, typename BDataType, typename EDataType, typename AccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferScalarPerVector, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferScalarPerVector, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDEBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v4, typename ComputeDataType = EDataType>
|
staticconstexpr |
The documentation for this struct was generated from the following file: