Overview

The AIE API encapsulates the matrix multiplication functionality in the aie::mmul class template.

This class template is parametrized with the matrix multiplication shape (MxKxN), the data types and, optionally, the requested accumulation precision. The resulting class defines a function that performs the multiplication and a data type for the result that can be converted to an accumulator/vector. The function interprets the input vectors as matrices as described by the shape parameters.

The following code snippet shows a portable sample blocked multiplication using the aie::mmul class. The matrices are assumed to be pre-tiled as defined by the mmul shape (MxK for A, KxN for B, and MxN for C).

// Portable blocked matrix multiplication C = A x B built on aie::mmul.
//
// A, B and C are pre-tiled: A holds rowA x colA tiles of MxK elements, B holds
// colB tiles per tile-row of KxN elements, and C holds colB tiles per tile-row
// of MxN elements. Every pass of the two outer loops produces a 2x2 group of
// C tiles, so both rowA and colB are walked two tiles at a time (the code
// reads tile row + 1 and tile column + 1 unconditionally).
//
//   rowA  number of tile rows of A / C
//   colA  number of tile columns of A, i.e. the number of reduction steps
//   colB  number of tile columns of B / C
//   pA    tiles of A
//   pB    tiles of B
//   pC    tiles of C (output)
template <unsigned M, unsigned K, unsigned N>
void mmul_blocked(unsigned rowA, unsigned colA, unsigned colB,
                  const int16 * __restrict pA, const int16 * __restrict pB, int16 * __restrict pC)
{
   using MMUL  = aie::mmul<M, K, N, int16, int16>;
   using TileA = aie::vector<int16, MMUL::size_A>;
   using TileB = aie::vector<int16, MMUL::size_B>;

   for (unsigned row = 0; row < rowA; row += 2) chess_loop_range(2,) {
       // Output cursors for the two tile rows of C handled in this pass.
       int16 * __restrict outUpper = pC + (row * colB) * MMUL::size_C;
       int16 * __restrict outLower = pC + ((row + 1) * colB) * MMUL::size_C;

       for (unsigned col = 0; col < colB; col += 2) chess_loop_range(2,) {
           // A is re-read from the start of its two tile rows for every pair of B columns.
           const int16 * __restrict aUpper = pA + (row * colA) * MMUL::size_A;
           const int16 * __restrict aLower = pA + ((row + 1) * colA) * MMUL::size_A;
           // B starts at the first tile row, in tile columns col and col + 1.
           const int16 * __restrict bLeft  = pB + col * MMUL::size_B;
           const int16 * __restrict bRight = pB + (col + 1) * MMUL::size_B;

           // First reduction step: A advances by one tile, B by one full tile row.
           TileA a0 = aie::load_v<MMUL::size_A>(aUpper);
           aUpper += MMUL::size_A;
           TileA a1 = aie::load_v<MMUL::size_A>(aLower);
           aLower += MMUL::size_A;
           TileB b0 = aie::load_v<MMUL::size_B>(bLeft);
           bLeft += MMUL::size_B * colB;
           TileB b1 = aie::load_v<MMUL::size_B>(bRight);
           bRight += MMUL::size_B * colB;

           // mul() initializes each accumulator; the loop below only accumulates.
           MMUL acc00;
           acc00.mul(a0, b0);
           MMUL acc01;
           acc01.mul(a0, b1);
           MMUL acc10;
           acc10.mul(a1, b0);
           MMUL acc11;
           acc11.mul(a1, b1);

           // Remaining colA - 1 reduction steps.
           for (unsigned step = 1; step < colA; ++step) chess_prepare_for_pipelining chess_loop_range(3,) {
               a0 = aie::load_v<MMUL::size_A>(aUpper);
               aUpper += MMUL::size_A;
               a1 = aie::load_v<MMUL::size_A>(aLower);
               aLower += MMUL::size_A;
               b0 = aie::load_v<MMUL::size_B>(bLeft);
               bLeft += MMUL::size_B * colB;
               b1 = aie::load_v<MMUL::size_B>(bRight);
               bRight += MMUL::size_B * colB;

               acc00.mac(a0, b0);
               acc01.mac(a0, b1);
               acc10.mac(a1, b0);
               acc11.mac(a1, b1);
           }

           // Write the 2x2 group: two adjacent tiles in each of the two C tile rows.
           aie::store_v(outUpper, acc00.template to_vector<int16>());
           outUpper += MMUL::size_C;
           aie::store_v(outUpper, acc01.template to_vector<int16>());
           outUpper += MMUL::size_C;
           aie::store_v(outLower, acc10.template to_vector<int16>());
           outLower += MMUL::size_C;
           aie::store_v(outLower, acc11.template to_vector<int16>());
           outLower += MMUL::size_C;
       }
   }
}

Classes
struct	aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >
	Type that encapsulates a blocked matrix multiplication C = A x B. More...

Matrix Multiplication Modes

Supported Matrix Multiplication Modes

Matrix multiplication modes for real types
Arch.	8b x 4b	8b x 8b	16b x 8b	8b x 16b	16b x 16b	32b x 16b	16b x 32b	32b x 32b	bfloat16 x bfloat16	float x float
AIE		4x8x4 4x16x4^a 8x8x4^a 2x8x8 4x8x8^a 1x16x8 2x16x8^a 4x16x8^a	4x4x4 8x4x4^a 4x8x4^a 4x4x8^a	4x4x8^a 4x4x4^a 8x8x1^ab	4x4x4^a 2x4x8^a 4x4x8^a 4x2x8^a 8x8x1^ab	2x4x8^a 4x4x4^a 4x2x4^a 2x2x4 2x4x4^a 4x4x2^a 2x2x8^a	4x2x2 2x4x8^a 4x4x4^a	4x2x4^a 2x2x2 2x4x2^a 2x8x2^a 4x2x2^a 4x4x2^a 2x4x4^a 4x4x1^a		4x2x4^a 2x2x2^a 2x4x2^ab 2x8x2^ab 4x2x2^a 4x4x2^a 2x4x4^a 4x4x1^ab
AIE-ML	4x16x8 8x16x8^a 4x32x8^ab	4x8x4^ab 4x16x4^ab 8x8x4^ab 2x8x8 4x8x8 8x8x8^a 1x16x8^ab 2x16x8^ab 4x16x8^ab	4x4x4^ab 8x4x4^ab 4x8x4 4x4x8 8x4x8^ab 2x8x8	4x4x8^ab 4x4x4^ab	4x4x4 2x4x8 4x4x8^ab 4x2x8 8x2x8^a 8x1x8^ab	2x4x8 4x4x8^ab 4x4x4 4x2x4 4x1x8^ab	2x4x8 4x4x4	4x2x4^a 4x4x4^ab 8x2x4^a 4x1x8^ab 8x1x8^ab	4x8x4 8x8x4^a 4x16x8^ab	4x8x4 4x1x4^b 4x1x8^ab

Matrix multiplication modes for complex types (c16b/c32b/cfloat represent complex types)
Arch.	16b x c16b	16b x c32b	c16b x 16b	c16b x c16b	c16b x 32b	c16b x c32b	32b x c16b	32b x c32b	c32b x 16b	c32b x c16b	c32b x 32b	c32b x c32b	float x cfloat	cfloat x float	cfloat x cfloat
AIE	4x2x2 4x4x4^a 4x4x1	2x4x2^a 2x4x4^a 2x8x2^a 4x4x2^a 4x4x1^a	2x2x4 2x2x8^a 2x4x4^a 2x4x8^a 4x2x4^a 4x4x2^a 4x4x4^a	2x2x2 2x4x2^a 2x8x2^a 2x4x4^a 4x2x2^a 4x4x2^a 4x2x4^a 4x4x1^a	2x2x2 2x4x2^a 2x8x2^a 2x4x4^a 4x2x2^a 4x4x2^a 4x2x4^a 4x4x1^a	2x2x2^a 2x4x2^a 4x2x1^a	2x2x2 2x4x2^a 2x8x2^a 2x4x4^a 4x2x2^a 4x4x2^a 4x2x4^a 4x4x1^a	2x2x2^a 2x4x2^a 4x2x1^a	2x4x2^a 2x8x2^a 2x4x4^a 4x4x2^a	2x2x2^a 2x4x2^a 4x4x1^a	1x2x2 2x2x2^a 2x4x2^a 4x4x1^a	1x2x2^a 2x2x1^a 2x2x1	2x2x2^a 2x4x2^a 4x2x1^a	2x2x2^a 2x4x2^a 4x4x1^a 2x4x1^ab	2x2x2^a 2x2x4^a 2x4x2^a 4x2x2^a 4x2x1^a
AIE-ML			2x4x8^ab 4x4x4^ab	1x4x8^ab 2x4x8^ab						1x2x4^ab 1x2x8^ab 2x2x8^ab 1x4x8^ab 2x4x8^ab		1x2x8^ab

Note: ^a - Emulated using multiple intrinsic calls.
^b - Requires additional data manipulation.

GEMM leveraging multidimensional addressing

Note: Multi-dimensional addressing and the corresponding tensor buffer streams were introduced with AIE-ML

Below is an example of an optimized bfloat16 GEMM kernel in which both input matrices, A and B, are addressed with 4D patterns (described by the a_desc and b_desc tensor descriptors in the code):

It is assumed that the data for both input matrices are pre-tiled and that the tiles are laid out in column-major order in memory.

// bfloat16 x bfloat16 GEMM using 4x8x4 aie::mmul tiles and tensor buffer streams.
//
// Every iteration of the outer loop produces a 4x4 group of C tiles (16x16
// elements), so the loop runs rowsA * colsB / (16 * 16) times and reduces over
// inner / 8 steps.
//
//   matA, matB  pre-tiled input matrices
//   matC        pre-tiled output matrix
//   rowsA       rows of A / C in elements
//   inner       columns of A / rows of B in elements
//   colsB       columns of B / C in elements
//
// NOTE(review): the dimensions are used as rowsA / 16, colsB / 16 and
// inner / 8, so they are presumably required to be multiples of 16, 16 and 8
// respectively - confirm against the callers.
void gemm_bf16xbf16(bfloat16 * matA, bfloat16 * matB, bfloat16 *__restrict matC,
                    int rowsA, int inner, int colsB)
{
    using MMUL = aie::mmul<4, 8, 4, bfloat16, bfloat16, accauto>;

    // Descriptor dimensions are listed outermost first; a step of 0 appears to
    // replay the same data for every iteration of that dimension.
    auto a_desc =  aie::make_tensor_descriptor<bfloat16, 32>(
                                               aie::tensor_dim(rowsA / 4 / 4, 4),
                                               aie::tensor_dim(colsB / 4 / 4, 0),
                                               aie::tensor_dim(inner / 8, rowsA / 4),
                                               aie::tensor_dim(4u, 1));

    // B is replayed (step 0) once per 16-row block of A, so the outermost
    // extent must be rowsA / 4 / 4 to mirror a_desc and the outer loop count.
    // Using colsB / 4 / 4 here is only correct when rowsA == colsB.
    auto b_desc = aie::make_tensor_descriptor<bfloat16, 32>(
                                               aie::tensor_dim(rowsA / 4 / 4, 0),
                                               aie::tensor_dim(colsB / 4 / 4, inner / 8 * 4),
                                               aie::tensor_dim(inner / 8, 1),
                                               aie::tensor_dim(4u, inner / 8));

    auto c_desc = aie::make_tensor_descriptor<bfloat16, 16>(
                                               aie::tensor_dim(rowsA / 4 / 4, 4),
                                               aie::tensor_dim(colsB / 4, rowsA / 4),
                                               aie::tensor_dim(4u, 1));

    auto tsA = aie::make_tensor_buffer_stream(matA, a_desc);
    auto tsB = aie::make_tensor_buffer_stream(matB, b_desc);
    auto tsC = aie::make_restrict_tensor_buffer_stream(matC, c_desc);

    for (int j = 0; j < rowsA * colsB / (16 * 16); ++j)
    {
        // A default-constructed mmul holds undefined data and the reduction
        // loop only ever calls mac(), so every accumulator must start at zero.
        // NOTE(review): assumes accauto resolves to accfloat for
        // bfloat16 x bfloat16; a mismatch is a compile error, not a silent bug.
        const MMUL::accum_type zero = aie::zeros<accfloat, MMUL::size_C>();

        MMUL C00(zero), C01(zero), C02(zero), C03(zero);
        MMUL C10(zero), C11(zero), C12(zero), C13(zero);
        MMUL C20(zero), C21(zero), C22(zero), C23(zero);
        MMUL C30(zero), C31(zero), C32(zero), C33(zero);

        for (int i = 0; i < inner / 8; ++i)
        {
            // The following pop calls are required to access the inner leaf stream.
            // As tsA and tsB are 4D streams, the returned inner stream will be 1D.
            //
            // Note that these calls advance the outer stream
            auto tsA_inner = tsA.pop();
            auto tsB_inner = tsB.pop();

            // Four A tiles (one per tile row of the 4x4 output group).
            aie::vector<bfloat16,32> Xbuff0, Xbuff1, Xbuff2, Xbuff3;
            tsA_inner >> Xbuff0 >> Xbuff1 >> Xbuff2 >> Xbuff3;

            // B tiles are consumed two at a time: output columns 0-1 first ...
            aie::vector<bfloat16,32> Ybuff0, Ybuff1;
            tsB_inner >> Ybuff0 >> Ybuff1;

            C00.mac(Xbuff0, Ybuff0); C01.mac(Xbuff0, Ybuff1);
            C10.mac(Xbuff1, Ybuff0); C11.mac(Xbuff1, Ybuff1);
            C20.mac(Xbuff2, Ybuff0); C21.mac(Xbuff2, Ybuff1);
            C30.mac(Xbuff3, Ybuff0); C31.mac(Xbuff3, Ybuff1);

            // ... then output columns 2-3, reusing the same two B registers.
            tsB_inner >> Ybuff0 >> Ybuff1;

            C02.mac(Xbuff0, Ybuff0); C03.mac(Xbuff0, Ybuff1);
            C12.mac(Xbuff1, Ybuff0); C13.mac(Xbuff1, Ybuff1);
            C22.mac(Xbuff2, Ybuff0); C23.mac(Xbuff2, Ybuff1);
            C32.mac(Xbuff3, Ybuff0); C33.mac(Xbuff3, Ybuff1);
        }

        // Results are written one tile column at a time (four tile rows each).
        tsC << C00.to_vector<bfloat16>() << C10.to_vector<bfloat16>() << C20.to_vector<bfloat16>() << C30.to_vector<bfloat16>()
            << C01.to_vector<bfloat16>() << C11.to_vector<bfloat16>() << C21.to_vector<bfloat16>() << C31.to_vector<bfloat16>()
            << C02.to_vector<bfloat16>() << C12.to_vector<bfloat16>() << C22.to_vector<bfloat16>() << C32.to_vector<bfloat16>()
            << C03.to_vector<bfloat16>() << C13.to_vector<bfloat16>() << C23.to_vector<bfloat16>() << C33.to_vector<bfloat16>();
    }
}

Supported Sparse Matrix Multiplication Modes

AIE-ML introduced hardware support for sparse matrix multiplication. For an M x K x N matrix multiplication with A being M x K, B being K x N, and C being M x N, a sparse B matrix may be stored in memory using a data layout which avoids storing zero values.

Note: Sparse matrix multiplications require that the sparse data be stored in column major layout. An internal transpose of the partially decompressed data is required by the underlying intrinsics and is carried out automatically by the API.

Matrix multiplication modes for real types (sparse B matrix)
Arch.	8b x 4b	8b x 8b	16b x 8b	16b x 16b	bfloat16 x bfloat16
AIE-ML	4x32x8	4x16x8 8x16x8^a 4x16x16^ab	2x16x8 4x16x8^a	2x8x8 4x8x8^a 2x8x16^ab	4x16x4 4x16x8^ab

Note: ^a - Emulated using multiple intrinsic calls
^b - Requires additional data manipulation

The following example shows an optimized int8 * sparse int8 GEMM:

// int8 x sparse-int8 GEMM using 4x16x8 aie::mmul tiles.
//
// A is read through a 4D tensor buffer stream, B through a sparse vector input
// stream that is re-created for every 16-row block of A, and C is written
// through a tensor buffer stream. Every iteration of the (j, b) loop nest
// produces a 4x2 group of C tiles (16x16 elements).
//
//   matA   pre-tiled dense input matrix
//   matB   sparse input matrix
//   matC   pre-tiled output matrix
//   rowsA  rows of A / C in elements
//   inner  columns of A / rows of B in elements
//   colsB  columns of B / C in elements
//
// NOTE(review): the loop bounds use rowsA / 16, colsB / 16 and inner / 16, so
// the dimensions are presumably required to be multiples of 16 - confirm
// against the callers.
void gemm_int8xint8_sparse(int8 * matA, int8 * matB, int8 *__restrict matC,
                           int rowsA, int inner, int colsB)
{
    using MMUL = aie::mmul<4, 16, 8, int8, int8, accauto>;

    auto a_desc = aie::make_tensor_descriptor<int8, 64>(aie::tensor_dim(rowsA / 4 / 4, 2),
                                                        aie::tensor_dim(colsB / 4 / 4, 0),
                                                        aie::tensor_dim(inner / 8, rowsA / 8),
                                                        aie::tensor_dim(2u, 1));

    auto c_desc = aie::make_tensor_descriptor<int8, 32>(aie::tensor_dim(rowsA / 4 / 4, 4),
                                                        aie::tensor_dim(colsB / 8, rowsA / 4),
                                                        aie::tensor_dim(4u, 1));

    auto tsA = aie::make_tensor_buffer_stream<aie_dm_resource::a>(matA, a_desc);
    auto tsC = aie::make_restrict_tensor_buffer_stream(matC, c_desc);

    for (int j = 0; j < rowsA / 16; j++)
        chess_loop_range(2,)
    {
        // The sparse B stream restarts from matB for every block-row of A.
        auto tsB = aie::sparse_vector_input_buffer_stream<int8, 128, aie_dm_resource::a>(matB);

        for (int b = 0; b < colsB / 16; b++)
            chess_prepare_for_pipelining
            chess_loop_range(2,)
        {
            // A default-constructed mmul holds undefined data and the reduction
            // loop only ever calls mac(), so every accumulator must start at zero.
            // NOTE(review): assumes accauto resolves to acc32 for int8 x int8;
            // a mismatch is a compile error, not a silent bug.
            const MMUL::accum_type zero = aie::zeros<acc32, MMUL::size_C>();

            MMUL C00(zero), C01(zero);
            MMUL C10(zero), C11(zero);
            MMUL C20(zero), C21(zero);
            MMUL C30(zero), C31(zero);

            for (int i = 0; i < inner / 16; i++)
                chess_prepare_for_pipelining
                chess_loop_range(4,)
            {
                // Two pops of the A stream per step, two 64-element vectors each.
                aie::vector<int8,64> Sbuff0, Sbuff1, Sbuff2, Sbuff3;
                tsA.pop() >> Sbuff0 >> Sbuff1;
                tsA.pop() >> Sbuff2 >> Sbuff3;

                // Interleave the two pops in chunks of 8 elements to form the
                // four A operands passed to mac().
                auto [Xbuff0, Xbuff1] = aie::interleave_zip(Sbuff0, Sbuff2, 8);
                auto [Xbuff2, Xbuff3] = aie::interleave_zip(Sbuff1, Sbuff3, 8);

                aie::sparse_vector<int8,128> Ybuff0, Ybuff1;
                tsB >> Ybuff0 >> Ybuff1;

                C00.mac(Xbuff0, Ybuff0); C01.mac(Xbuff0, Ybuff1);
                C10.mac(Xbuff1, Ybuff0); C11.mac(Xbuff1, Ybuff1);
                C20.mac(Xbuff2, Ybuff0); C21.mac(Xbuff2, Ybuff1);
                C30.mac(Xbuff3, Ybuff0); C31.mac(Xbuff3, Ybuff1);
            }

            // Results are written one tile column at a time (four tile rows each).
            tsC << C00.to_vector<int8>() << C10.to_vector<int8>() << C20.to_vector<int8>() << C30.to_vector<int8>()
                << C01.to_vector<int8>() << C11.to_vector<int8>() << C21.to_vector<int8>() << C31.to_vector<int8>();
        }
    }
}

Class Documentation

◆ aie::mmul

struct aie::mmul

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>
struct aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >

Type that encapsulates a blocked matrix multiplication C = A x B.

Objects of this type encapsulate the current result of the multiplication. The first result is computed with the mul method. New multiplications can be accumulated using the mac method.

Template Parameters

M_Elems	Rows in matrix A.
K_Elems	Columns in matrix A / Rows in matrix B.
N_Elems	Columns in matrix B.
TypeA	Type of the elements in matrix A. It must meet aie::ElemBaseType.
TypeB	Optional. Type of the elements in matrix B. By default it is the same as TypeA. It must meet aie::ElemBaseType.
AccumTag	Optional. Type of the elements of the accumulator that contains the results to be written in matrix C. It must meet aie::AccumElemBaseType. If not specified, it uses the default accumulation type for multiplications of TypeA x TypeB.

Public Types
using	accum_type = typename mmul_impl::accum_type

using	mmul_impl = detail::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, detail::to_native_accum_bits_for_mul_types_tag< TypeA, TypeB, AccumTag >()>

Public Member Functions
	mmul ()
	Constructor.

	mmul (const accum_type &acc)
	Constructor.

	mmul (const binary_op< accum_type, bool, Operation::Zero > &op)
	Constructor.

template<typename T >
	mmul (const vector< T, size_C > &v, int shift=0)
	Constructor.

template<VectorOrOp VecA, VectorOrOp VecB> requires (VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)
void	mac (const VecA &a, const VecB &b)
	Multiply the two given matrices and add it to the result.

template<VectorOrOp VecA, SparseVectorOrOp VecB> requires (arch::is(arch::AIE_ML) && VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)
void	mac (const VecA &a, const VecB &b)
	Multiply the two given matrices and add it to the result.

template<VectorOrOp VecA, VectorOrOp VecB> requires (VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)
void	mul (const VecA &a, const VecB &b)
	Initialize the result value with the multiplication of the two given matrices.

template<VectorOrOp VecA, SparseVectorOrOp VecB> requires (arch::is(arch::AIE_ML) && VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)
void	mul (const VecA &a, const VecB &b)
	Initialize the result value with the multiplication of the two given matrices.

	operator accum_type () const
	Conversion operator to accumulator.

mmul &	operator= (const accum_type &acc)
	Reinitialize the mmul object using the given accumulator.

accum_type	to_accum () const
	Return the result of the multiplication as an accumulator.

template<typename T >
vector< T, size_C >	to_vector (int shift=0) const
	Return the result of the multiplication as a vector of the requested type.

Static Public Member Functions
static constexpr unsigned	size ()
	Returns number of elements in matrix C.

Static Public Attributes
static constexpr unsigned	K = K_Elems
	Number of columns in matrix A, and number of rows in matrix B.

static constexpr unsigned	M = M_Elems
	Number of rows in matrix A.

static constexpr unsigned	N = N_Elems
	Number of columns in matrix B.

static constexpr unsigned	size_A = M * K
	Number of elements in matrix A.

static constexpr unsigned	size_B = K * N
	Number of elements in matrix B.

static constexpr unsigned	size_C = M * N
	Number of elements in matrix C.

Member Typedef Documentation

◆ accum_type

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

using aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::accum_type = typename mmul_impl::accum_type

◆ mmul_impl

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

using aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mmul_impl = detail::mmul<M_Elems, K_Elems, N_Elems, TypeA, TypeB, detail::to_native_accum_bits_for_mul_types_tag<TypeA, TypeB, AccumTag>()>

Constructor & Destructor Documentation

◆ mmul() [1/4]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mmul ( )

inline

Constructor.

Data is undefined.

◆ mmul() [2/4]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mmul ( const accum_type & acc )

inline

Constructor.

Data is initialized from the given accumulator.

Data is expected to be in row-major layout.

Parameters

acc	Accumulator from which the data is initialized.

◆ mmul() [3/4]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mmul ( const binary_op< accum_type, bool, Operation::Zero > & op )

inline

Constructor.

Data is initialized from the given operation modifier.

Parameters

op	aie::op_zero operation.

◆ mmul() [4/4]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

template<typename T >

aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mmul	(	const vector< T, size_C > &	v,
		int	shift = 0
	)

inline

Constructor.

Data is initialized from the given vector.

Data is expected to be in row-major layout.

Parameters

v	Vector from which the data is initialized.
shift	Upshift in bits to be applied to input data. This parameter is ignored for floating-point types.

Member Function Documentation

◆ mac() [1/2]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

template<VectorOrOp VecA, VectorOrOp VecB>
requires (VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)

void aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mac	(	const VecA &	a,
		const VecB &	b
	)

inline

Multiply the two given matrices and add it to the result.

Parameters

a	Represents the A input matrix with row-major data layout. The number of elements must be mmul::size_A (M * K). It must meet aie::VectorOrOp.
b	Represents the B input matrix with row-major data layout. The number of elements must be mmul::size_B (K * N). It must meet aie::VectorOrOp.

◆ mac() [2/2]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

template<VectorOrOp VecA, SparseVectorOrOp VecB>
requires (arch::is(arch::AIE_ML) && VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)

void aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mac	(	const VecA &	a,
		const VecB &	b
	)

inline

Multiply the two given matrices and add it to the result.

Matrix B is sparse.

Parameters

a	Vector that represents the A input matrix.
b	Sparse vector that represents the B input matrix.

◆ mul() [1/2]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

template<VectorOrOp VecA, VectorOrOp VecB>
requires (VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)

void aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mul	(	const VecA &	a,
		const VecB &	b
	)

inline

Initialize the result value with the multiplication of the two given matrices.

Parameters

a	Represents the A input matrix with row-major data layout. The number of elements must be mmul::size_A (M * K). It must meet aie::VectorOrOp.
b	Represents the B input matrix with row-major data layout. The number of elements must be mmul::size_B (K * N). It must meet aie::VectorOrOp.

◆ mul() [2/2]

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

template<VectorOrOp VecA, SparseVectorOrOp VecB>
requires (arch::is(arch::AIE_ML) && VecA::size() == size_A && VecB::size() == size_B && std::is_same_v<typename VecA::value_type, TypeA> && std::is_same_v<typename VecB::value_type, TypeB>)

void aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::mul	(	const VecA &	a,
		const VecB &	b
	)

inline

Initialize the result value with the multiplication of the two given matrices.

Matrix B is sparse.

Parameters

a	Vector that represents the A input matrix.
b	Sparse vector that represents the B input matrix.

◆ operator accum_type()

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::operator accum_type ( ) const

inline

Conversion operator to accumulator.

◆ operator=()

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

mmul & aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::operator= ( const accum_type & acc )

inline

Reinitialize the mmul object using the given accumulator.

Parameters

acc	Accumulator from which the data is initialized.

◆ size()

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

static constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::size ( )

inline static constexpr

Returns number of elements in matrix C.

◆ to_accum()

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

accum_type aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::to_accum ( ) const

inline

Return the result of the multiplication as an accumulator.

◆ to_vector()

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

template<typename T >

vector< T, size_C > aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::to_vector ( int shift = 0 ) const

inline

Return the result of the multiplication as a vector of the requested type.

Parameters

shift Downshift in bits to be applied to output data. This parameter is ignored for floating-point types.

Member Data Documentation

◆ K

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::K = K_Elems

static constexpr

Number of columns in matrix A, and number of rows in matrix B.

◆ M

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::M = M_Elems

static constexpr

Number of rows in matrix A.

◆ N

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::N = N_Elems

static constexpr

Number of columns in matrix B.

◆ size_A

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::size_A = M * K

static constexpr

Number of elements in matrix A.

◆ size_B

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::size_B = K * N

static constexpr

Number of elements in matrix B.

◆ size_C

template<unsigned M_Elems, unsigned K_Elems, unsigned N_Elems, ElemBaseType TypeA, ElemBaseType TypeB = TypeA, AccumElemBaseType AccumTag = accauto>

constexpr unsigned aie::mmul< M_Elems, K_Elems, N_Elems, TypeA, TypeB, AccumTag >::size_C = M * N

static constexpr

Number of elements in matrix C.

Overview

Classes

Matrix Multiplication Modes

Supported Matrix Multiplication Modes

GEMM leveraging multidimensional addressing

Supported Sparse Matrix Multiplication Modes

Class Documentation

◆ aie::mmul

Public Types

Public Member Functions

Static Public Member Functions

Static Public Attributes

Member Typedef Documentation

◆ accum_type

◆ mmul_impl

Constructor & Destructor Documentation

◆ mmul() [1/4]

◆ mmul() [2/4]

◆ mmul() [3/4]

◆ mmul() [4/4]

Member Function Documentation

◆ mac() [1/2]

◆ mac() [2/2]

◆ mul() [1/2]

◆ mul() [2/2]

◆ operator accum_type()

◆ operator=()

◆ size()

◆ to_accum()

◆ to_vector()

Member Data Documentation

◆ K

◆ M

◆ N

◆ size_A

◆ size_B

◆ size_C