ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector > Struct Template Reference#
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector > Struct Template Reference
#include <thread_group_tensor_slice_transfer_direct_load.hpp>
Public Types | |
| using | Index = MultiIndex<nDim> |
| using | SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})) |
| using | DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})) |
| using | SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) |
| using | DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) |
Public Member Functions | |
| __device__ constexpr | ThreadGroupTensorSliceTransfer_DirectLoad (const SrcDesc &src_desc, const Index &src_block_slice_origin, const DstDesc &dst_desc, const Index &dst_block_slice_origin) |
| __device__ void | SetSrcSliceOrigin (const SrcDesc &src_desc, const Index &src_slice_origin_idx) |
| __device__ void | SetDstSliceOrigin (const DstDesc &dst_desc, const Index &dst_slice_origin_idx) |
| __device__ void | ResetDstSliceWindow (const DstDesc &dst_desc) |
| template<typename SrcBuffer, typename DstBuffer> | |
| __device__ void | Run (const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf) |
| __device__ void | MoveSrcSliceWindow (const SrcDesc &src_desc, const Index &step) |
| template<typename DescType> | |
| __device__ auto | generate_steps (const DescType &desc, int sign) |
Static Public Member Functions | |
| static __device__ constexpr bool | AreThreadClusterLengthsValid () |
Static Public Attributes | |
| static constexpr index_t | nDim = remove_reference_t<SrcDesc>::GetNumOfDimension() |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | block_slice_lengths = BlockSliceLengths{} |
| static constexpr auto | thread_cluster_lengths = ThreadClusterLengths{} |
| static constexpr auto | thread_single_load_size |
| static constexpr auto | thread_steps = thread_cluster_lengths * thread_single_load_size |
| static constexpr auto | thread_slice_lengths = block_slice_lengths / thread_steps |
Detailed Description
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
struct ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >
struct ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >
Transfer that uses direct load instructions to copy data from global to LDS memory.
Traditional loads first copy data from global to registers, and then from registers to LDS. Direct loads do not need an intermediate step, data is copied directly from global to LDS, without the use of additional registers.
However, the instruction has limitations:
- each thread must copy exactly a single DWORD - 4 bytes;
- threads within a single wavefront must write consecutive DWORDS into LDS, (data in global do not need to be contiguous, each thread might have its own offset).
To make sure that all the transfers finished, the waitcnt instruction must be used with vmcnt instead of lgkmcnt.
Limitations of the transfer class:
- SrcData must be the same as DstData - no possibility to convert the data type in flight;
- DstVectorDim must be the last dimension;
- SrcVectorDim must be the last dimension if ScalarPerVector is greater than 1;
- ScalarPerVector times the number of bytes of DstData must be equal to a single DWORD = 4B (for examlpe if DstData is fp32, then ScalarPerVector must be 1; if DstData is fp16, ScalarPerVector must be 2);
- if ScalarPerVector is greater than 1, the contiguous dimension in src and dst must be the same dimension;
- threads in a wavefront must write contiguous data to LDS (when wavefront size is 64, they must write 64 contiguous DWORDs) - ThreadClusterLengths must be prepared in such a way to guarantee that.
Member Typedef Documentation
◆ DstCoord
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
| using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})) |
◆ DstCoordStep
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
| using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) |
◆ Index
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
| using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::Index = MultiIndex<nDim> |
◆ SrcCoord
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
| using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})) |
◆ SrcCoordStep
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
| using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) |
Constructor & Destructor Documentation
◆ ThreadGroupTensorSliceTransfer_DirectLoad()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
inlineconstexpr |
Member Function Documentation
◆ AreThreadClusterLengthsValid()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
inlinestaticconstexpr |
◆ generate_steps()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
template<typename DescType>
|
inline |
◆ MoveSrcSliceWindow()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
inline |
◆ ResetDstSliceWindow()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
inline |
◆ Run()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
template<typename SrcBuffer, typename DstBuffer>
|
inline |
◆ SetDstSliceOrigin()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
inline |
◆ SetSrcSliceOrigin()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
inline |
Member Data Documentation
◆ block_slice_lengths
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
◆ I0
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
◆ I1
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
◆ nDim
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
◆ thread_cluster_lengths
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
◆ thread_single_load_size
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
Initial value:
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
Definition threadwise_tensor_slice_transfer_util.hpp:20
◆ thread_slice_lengths
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
◆ thread_steps
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
|
staticconstexpr |
The documentation for this struct was generated from the following file: