By default the input and output type are the same, but you can optionally specify a second type for a different output type.
The following snippet shows the sample implementation of a radix-2 FFT stage for any vectorization value.
template <unsigned Vectorization>
void radix2_dit(const cint32 * __restrict x,
const cint16 * __restrict tw,
unsigned n,
unsigned shift_tw,
unsigned shift,
bool inv,
cint32 * __restrict y)
{
FFT fft;
auto it_stage = fft.begin_stage(x, tw);
auto it_out0 = aie::begin_vector<FFT::out_vector_size>(y);
auto it_out1 = aie::begin_vector<FFT::out_vector_size>(y + n / 2);
for (int j = 0; j < n / (2 * FFT::out_vector_size); ++j)
chess_prepare_for_pipelining
chess_loop_range(1,)
{
const auto out = fft.dit(*it_stage++, shift_tw,
shift,
inv);
*it_out0++ = out[0];
*it_out1++ = out[1];
}
}
auto inv(E a)
Definition: aie.hpp:7831
shift_bits< T, type_bits_v< T >, Elems > shift
Definition: shift.hpp:119
Definition: aie_declaration.hpp:85
Supported FFT/IFFT Modes
Input Type | Output Type | Radix |
c16b | c16b | 2 |
c16b | c32b | 2 3 4 5 |
c32b | c16b | 2 4 |
c32b | c32b | 2 3 4 5 |
cfloat | cfloat | 2 |
For a given radix R with n number of points, there will be R number of outputs spaced by n/R. The above example shows 2 outputs for a radix 2, for radix 4 we would have:
template <unsigned Vectorization>
void radix4_dit(const cint32 * __restrict x,
const cint16 * __restrict tw1,
const cint16 * __restrict tw2,
unsigned n,
unsigned shift_tw,
unsigned shift,
bool inv,
cint32 * __restrict y)
{
FFT fft;
auto it_stage = fft.begin_stage(x, tw1, tw2);
auto it_out0 = aie::begin_restrict_vector<FFT::out_vector_size>(y);
auto it_out1 = aie::begin_restrict_vector<FFT::out_vector_size>(y + n / 4);
auto it_out2 = aie::begin_restrict_vector<FFT::out_vector_size>(y + 2*n / 4);
auto it_out3 = aie::begin_restrict_vector<FFT::out_vector_size>(y + 3*n / 4);
for (int j = 0; j < n / (4 * FFT::out_vector_size); ++j)
chess_prepare_for_pipelining
chess_loop_range(1,)
{
const auto out = fft.dit(*it_stage++, shift_tw,
shift,
inv);
*it_out0++ = out[0];
*it_out1++ = out[1];
*it_out2++ = out[2];
*it_out3++ = out[3];
}
}
However you can also use less with different incrementation schemes to achieve the same result. This approach is currently recommended for optimal performance on c32b radix 4 for Vectorization > 1 due to internal pointer requirements:
template <unsigned Vectorization>
void radix4_dit(const cint32 * __restrict x,
const cint16 * __restrict tw1,
const cint16 * __restrict tw2,
unsigned n,
unsigned shift_tw,
unsigned shift,
bool inv,
cint32 * __restrict y)
{
FFT fft;
auto it_stage = fft.begin_stage(x, tw1, tw2);
auto it_out0 = aie::begin_restrict_vector<FFT::out_vector_size>(y);
auto it_out1 = aie::begin_restrict_vector<FFT::out_vector_size>(y + n / 2);
for (int j = 0; j < n / (4 * FFT::out_vector_size); ++j)
chess_prepare_for_pipelining
chess_loop_range(1,)
{
const auto out = fft.dit(*it_stage++, shift_tw,
shift,
inv);
*it_out0 = out[0]; it_out0 += n/16;
*it_out0 = out[1]; it_out0 += -(int)(n/16)+1;
*it_out1 = out[2]; it_out1 += n/16;
*it_out1 = out[3]; it_out1 += -(int)(n/16)+1;
}
}
Odd radix FFTs are also supported with radix 3 and 5 being implemented as:
template <unsigned Vectorization>
void radix3_dit(const cint32 * __restrict x,
const cint16 * __restrict tw1,
const cint16 * __restrict tw2,
unsigned n,
unsigned shift_tw,
unsigned shift,
bool inv,
cint32 * __restrict y)
{
FFT fft;
constexpr unsigned one_third_Q15 = 10923;
const int block_size = (n * one_third_Q15) >> 15;
auto it_stage = fft.begin_stage(x, tw1, tw2);
auto it_out0 = aie::begin_restrict_vector<FFT::out_vector_size>(y);
auto it_out1 = aie::begin_restrict_vector<FFT::out_vector_size>(y + block_size);
auto it_out2 = aie::begin_restrict_vector<FFT::out_vector_size>(y + 2 * block_size);
for (int j = 0; j < block_size / FFT::out_vector_size; ++j)
chess_prepare_for_pipelining
chess_loop_range(1,)
{
const auto out = fft.dit(*it_stage++, shift_tw,
shift,
inv);
*it_out0 = out[0];
*it_out1 = out[1];
*it_out2 = out[2];
}
}
and
template <unsigned Vectorization>
void radix5_dit(const cint32 * __restrict x,
const cint16 * __restrict tw1,
const cint16 * __restrict tw2,
const cint16 * __restrict tw3,
const cint16 * __restrict tw4,
unsigned n,
unsigned shift_tw,
unsigned shift,
bool inv,
cint32 * __restrict y)
{
FFT fft;
constexpr unsigned one_fifth_Q15 = 6554;
const int block_size = (n * one_fifth_Q15) >> 15;
auto it_stage = fft.begin_stage(x, tw1, tw2, tw3, tw4);
auto it_out = aie::begin_restrict_vector<FFT::out_vector_size>(y);
for (int j = 0; j < block_size / FFT::out_vector_size; ++j)
chess_prepare_for_pipelining
chess_loop_range(1,)
{
const auto out = fft.dit(*it_stage++, shift_tw,
shift,
inv);
*it_out = out[0]; it_out += block_size;
*it_out = out[1]; it_out += block_size;
*it_out = out[2]; it_out += block_size;
*it_out = out[3]; it_out += block_size;
*it_out = out[4]; it_out += -4 * block_size + 1;
}
}
Going to/from cint16/cint32 is also supported. Here is an example of a radix 2 downscaling from cint32 to cint16:
template <unsigned Vectorization>
void radix2_down_dit(
const cint32_t * __restrict x,
int n,
int shift_tw,
int shift,
bool inv,
{
FFT fft;
auto it_stage = fft.begin_stage(x, tw);
auto it_out0 = aie::begin_vector<FFT::out_vector_size>(y);
auto it_out1 = aie::begin_vector<FFT::out_vector_size>(y + n / 2);
for (int j = 0; j < n / (2 * FFT::out_vector_size); ++j)
chess_prepare_for_pipelining
chess_loop_range(1,)
{
const auto out = fft.dit(*it_stage++, shift_tw,
shift,
inv);
*it_out0++ = out[0];
*it_out1++ = out[1];
}
}
cint16 cint16_t
Definition: types.hpp:104
cint32 cint32_t
Definition: types.hpp:105