X-Git-Url: https://git.stderr.nl/gitweb?p=matthijs%2Fprojects%2Fmontium-fft.git;a=blobdiff_plain;f=FFT.mc;h=a76377e1c70e3c26b829cd043dff8154b3699d29;hp=e58d2fa771e0c644297c99d1935d66dd8b1055ba;hb=HEAD;hpb=e09e88b193ec1b84311b345abb2b9d7943060602

diff --git a/FFT.mc b/FFT.mc
index e58d2fa..a76377e 100644
--- a/FFT.mc
+++ b/FFT.mc
@@ -6,28 +6,20 @@
 #include "FFT.h"
 
-	word in_a_re, in_a_im, in_b_re, in_b_im, in_W_re, in_W_im;
-	//out_a_re, out_a_im, out_b_re, out_b_im;
-	//mem input_a_re, input_a_im, input_b_re, input_b_im, output_a_re, output_a_im, output_b_re, output_b_im, twiddle_re, twiddle_im;
-
-void init()
-{
-
-}
-
-
-
-/*void update_gpi() {
-	set_gpi(0, 1);
-}*/
-
+/**
+ * Executes a single butterfly on ALU 0-3. The inputs are the words taken
+ * from in, which are routed to the various inputs of ALU 0-3. Outputs are
+ * returned and will also be available on the output ports of ALU 0 and 2.
+ */
 INLINE struct bf_out butterfly(struct bf_in in) {
 	struct bf_out out;
 	/* ALU 0 & 1 */
 	/* im(W) * im(b) */
 	aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));
+	/* re(W * b) = re(W) * re(b) - im(W) * im(b) */
 	aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);
 
+	/* re(out_a) = re(a) + re(W * b) */
 	out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));
 
@@ -48,23 +40,55 @@ INLINE struct bf_out butterfly(struct bf_in in) {
 	return out;
 }
 
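For reference, the butterfly above computes the standard radix-2 decimation-in-time step out_a = a + W*b, out_b = a - W*b, with the complex multiplication unrolled into four real expressions spread over ALU 0-3. A minimal plain-C sketch of the same arithmetic (floating point instead of the Montium's fixed point; butterfly_ref and cpx_pair are hypothetical names, not part of this code):

	#include <complex.h>

	struct cpx_pair { float complex a, b; };

	/* Reference radix-2 DIT butterfly: out_a = a + W*b, out_b = a - W*b.
	 * The MontiumCC version computes the same four real expressions,
	 * e.g. re(out_a) = re(a) + (re(W)*re(b) - im(W)*im(b)). */
	static struct cpx_pair butterfly_ref(float complex a, float complex b,
	                                     float complex W)
	{
		struct cpx_pair out;
		float complex Wxb = W * b;	/* the twiddled b input */
		out.a = a + Wxb;
		out.b = a - Wxb;
		return out;
	}
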
-INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half) {
-	add_offset(m.output_a_re, 2);
-	add_offset(m.output_a_im, 2);
-	add_offset(m.output_b_re, 2);
-	add_offset(m.output_b_im, 2);
-
-	if (second_half) {
-		write_mem(m.output_a_re, res.a_re);
-		write_mem(m.output_a_im, res.a_im);
-		write_mem(m.output_b_re, res.b_re);
-		write_mem(m.output_b_im, res.b_im);
+/**
+ * Writes the output of a butterfly given in res to the correct memory
+ * locations.
+ * @param second_half Are we in the second half of the stage?
+ */
+INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half, enum out_strategy out_s) {
+	if (out_s == REGULAR_OUT) {
+		/* Skip a memory on each cycle during the regular stages */
+		add_offset(m.output_a_re, 2);
+		add_offset(m.output_a_im, 2);
+		add_offset(m.output_b_re, 2);
+		add_offset(m.output_b_im, 2);
 	} else {
-		/* Write a results to memory b and v.v. */
+		/* Simply write the output linearly */
+		add_offset(m.output_a_re, 1);
+		add_offset(m.output_a_im, 1);
+		add_offset(m.output_b_re, 1);
+		add_offset(m.output_b_im, 1);
+	}
+	if (out_s == BITREVERSED_OUT) {
+		/*
+		 * Use the memories (whose addresses are n_t - 1 bits wide)
+		 * bitreversed. Since we are generating the samples in
+		 * sequence (0, 1, 2, 3, ...) but writing them alternately
+		 * to two different memories (0, 8, 1, 9, ...), the last bit
+		 * is already bitreversed, so in effect we have fully
+		 * bitreversed the results. Note that this holds in the
+		 * non-distributed case (Q = 1), but might also hold in the
+		 * distributed case (if the tile numbers are bitreversed
+		 * before concatenating memory).
+		 */
+		use_bitreverse(m.output_a_re, PARAM_n_t - 1);
+		use_bitreverse(m.output_a_im, PARAM_n_t - 1);
+		use_bitreverse(m.output_b_re, PARAM_n_t - 1);
+		use_bitreverse(m.output_b_im, PARAM_n_t - 1);
+	}
+
+	if (out_s == REGULAR_OUT && second_half) {
+		/* During the second half of the regular stages, swap
+		 * memories a and b */
 		write_mem(m.output_a_re, res.b_re);
 		write_mem(m.output_a_im, res.b_im);
 		write_mem(m.output_b_re, res.a_re);
 		write_mem(m.output_b_im, res.a_im);
+	} else {
+		/* Simply write a to mem a and b to mem b */
+		write_mem(m.output_a_re, res.a_re);
+		write_mem(m.output_a_im, res.a_im);
+		write_mem(m.output_b_re, res.b_re);
+		write_mem(m.output_b_im, res.b_im);
 	}
 }
 
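The bitreversal argument in the comment above can be sanity-checked with a small, hypothetical plain-C model (not MontiumCC code; it assumes PARAM_n_t = 4, i.e. 2^4 = 16 samples, and bitrev is a helper defined here):

	#include <stdio.h>

	/* Reverse the low `bits` bits of v. */
	static unsigned bitrev(unsigned v, int bits)
	{
		unsigned r = 0;
		for (int i = 0; i < bits; i++)
			r |= ((v >> i) & 1u) << (bits - 1 - i);
		return r;
	}

	int main(void)
	{
		const int n_t = 4;	/* assumed PARAM_n_t */
		for (unsigned k = 0; k < 16; k++) {
			unsigned mem  = k & 1;			/* memory a (0) or b (1) */
			unsigned addr = bitrev(k >> 1, n_t - 1);/* what use_bitreverse yields */
			unsigned full = bitrev(k, n_t);		/* full n_t-bit bitreversal */
			printf("k=%2u -> mem %c addr %u, full bitrev %2u\n",
			       k, mem ? 'b' : 'a', addr, full);
		}
		return 0;
	}

For every k, mem * 8 + addr equals full: alternating between the two memories supplies the reversed last address bit, and use_bitreverse on the remaining n_t - 1 bits does the rest, exactly as the comment argues for the Q = 1 case.
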
@@ -78,9 +102,9 @@ INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_h
  * read input a from memory b and v.v. If not,
  * simply read a from memory a and b from memory b.
  */
-INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage_odd) {
+INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, int stage) {
 	struct bf_in in;
-	/* TODO: Select left or right memories */
+	/* Swap memories a and b during the odd cycles */
 	if (cycle_odd) {
 		in.a_re = read_mem(m.input_a_re);
 		in.a_im = read_mem(m.input_a_im);
@@ -95,22 +119,28 @@ INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage
 	in.W_re = read_mem(m.twiddle_re);
 	in.W_im = read_mem(m.twiddle_im);
-
+
+	/* Read the inputs sequentially */
 	add_offset(m.input_a_re, 1);
 	add_offset(m.input_a_im, 1);
 	add_offset(m.input_b_re, 1);
 	add_offset(m.input_b_im, 1);
-	/* TODO: Update twiddle offsets */
+
+	/* TODO: Is this true? */
+	add_offset(m.twiddle_re, (PARAM_N_t>>stage));
+	add_offset(m.twiddle_im, (PARAM_N_t>>stage));
+	use_mask(m.twiddle_re, (PARAM_N_t/2)-1);
+	use_mask(m.twiddle_im, (PARAM_N_t/2)-1);
+
 	return in;
 }
 
 /**
- * Initializes the addresses for the various memories.
- * @param stage_odd True if this is an odd stage.
- *@param second_half True if we are initing halfway a stage.
+ * Initializes the addresses for reading the inputs and twiddle factors.
+ * Should be called once at the start of each stage.
  */
-INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {
-	/* TODO: Select left or right memories */
+INLINE void init_input_addresses_regular(struct mems m) {
+	/* We simply start reading incrementally from address 0 */
 	set_base(m.input_a_im, 0);
 	set_base(m.input_b_re, 0);
 	set_base(m.input_b_im, 0);
@@ -124,69 +154,118 @@ INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {
 	set_offset(m.twiddle_re, 0);
 	set_offset(m.twiddle_im, 0);
 }
-
-
-INLINE void init_output_addresses_regular(struct mems m, bool stage_odd, bool second_half) {
+
+/**
+ * Initializes the addresses for writing the outputs. This function must be
+ * called twice per stage, since halfway through the stage the addressing changes.
+ */
+INLINE void init_output_addresses_regular(struct mems m, bool second_half, enum out_strategy out_s) {
+	/* Only the regular stages need their output addresses reset for the
+	 * second half */
+	if (out_s != REGULAR_OUT && second_half)
+		return;
 	/*
 	 * For the second half of the stage, the starting addresses are
	 * reversed. write_output_regular above will also swap the output
	 * memories.
	 * TODO: Better comments :-)
	 */
-
 	set_base(m.output_a_re, 0);
 	set_base(m.output_a_im, 0);
 	set_base(m.output_b_re, 0);
 	set_base(m.output_b_im, 0);
-
-	/* We subtract two from every address, since write_output_regular
-	 * adds two to the offset before writing the first (and every other)
-	 * result. */
-	if (second_half) {
-		set_offset(m.output_a_re, 1-2);
-		set_offset(m.output_a_im, 1-2);
-		set_offset(m.output_b_re, 0-2);
-		set_offset(m.output_b_im, 0-2);
+	if (out_s == REGULAR_OUT) {
+		/* We subtract two from every address, since write_output_regular
+		 * adds two to the offset before writing the first (and every
+		 * other) result. */
+		if (second_half) {
+			set_offset(m.output_a_re, 1-2);
+			set_offset(m.output_a_im, 1-2);
+			set_offset(m.output_b_re, 0-2);
+			set_offset(m.output_b_im, 0-2);
+		} else {
+			set_offset(m.output_a_re, 0-2);
+			set_offset(m.output_a_im, 0-2);
+			set_offset(m.output_b_re, 1-2);
+			set_offset(m.output_b_im, 1-2);
+		}
 	} else {
-		set_offset(m.output_a_re, 1-2);
-		set_offset(m.output_a_im, 1-2);
-		set_offset(m.output_b_re, 0-2);
-		set_offset(m.output_b_im, 0-2);
+		/* Write sequentially, starting at 0 for both memories */
+		set_offset(m.output_a_re, 0-1);
+		set_offset(m.output_a_im, 0-1);
+		set_offset(m.output_b_re, 0-1);
+		set_offset(m.output_b_im, 0-1);
 	}
 }
 
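The "Is this true?" TODO in read_input_regular above is easy to sanity-check outside the Montium. A hypothetical plain-C sketch of the twiddle index sequence implied by the add_offset/use_mask pair (assuming PARAM_N_t = 16, matching the four stages in run() below; the real value comes from FFT.h):

	#include <stdio.h>

	#define PARAM_N_t 16	/* assumed */

	int main(void)
	{
		for (int stage = 1; stage <= 4; stage++) {
			int idx = 0;	/* set_offset(m.twiddle_re, 0) */
			printf("stage %d:", stage);
			for (int k = 0; k < PARAM_N_t / 2; k++) {
				printf(" %2d", idx);
				/* add_offset by N_t >> stage, masked to the
				 * N_t/2-entry twiddle table */
				idx = (idx + (PARAM_N_t >> stage)) & (PARAM_N_t / 2 - 1);
			}
			printf("\n");
		}
		return 0;
	}

This prints 0 0 0 ... for stage 1, 0 4 0 4 ... for stage 2, 0 2 4 6 ... for stage 3 and 0 1 2 ... 7 for stage 4: each stage cycles through exactly the twiddle set a radix-2 DIT stage needs (only W^0 in the first stage, all of W^0..W^7 in the last), which suggests the offset and mask are right; the exact interleaving order depends on how the previous stage laid out its outputs.
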
-INLINE void do_half_regular_stage(struct mems m, bool stage_odd, bool second_half){
-	struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage_odd);
+INLINE void do_half_regular_stage(struct mems m, int stage, bool second_half, enum in_strategy in_s, enum out_strategy out_s){
+	/*
+	 * We are doing two cycles in each iteration, so we can alternate the
+	 * cycle_odd argument (which only works with constants; I don't expect
+	 * the optimizer to do this loop unrolling for us). Since we need to
+	 * write outputs before reading, but don't have any outputs to write
+	 * in the first cycle, we must put the first cycle outside of the
+	 * loop. Since the loop does two cycles at a time, this means there
+	 * must be two cycles outside of the loop, so we put one at the end as
+	 * well. Additionally, we also need to write the outputs of the last
+	 * cycle in an extra cycle at the end. We probably can't combine this
+	 * last cycle with the first cycle of the next stage, because they
+	 * need the same memories (input becomes output and v.v.).
+	 */
+
+	/* Initialize the output addresses; this must be done twice per stage */
+	init_output_addresses_regular(m, second_half, out_s);
+
+	/* First cycle (no previous output to write) */
+	struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage);
 	struct bf_out out = butterfly(in);
-
-	/* Now, do a single stage. That means N_t / 2 cycles. Since we do 2
+
+	/* Now, do half of a single stage. That means N_t / 4 cycles. Since we do 2
 	 * cycles on every iteration, plus one before and after the loop,
-	 * we will loop N_t / 4 - 1 times. */
-	init_loop(LC2, (N_t / 4) - 1);
+	 * we will loop N_t / 8 - 1 times. We subtract an extra 1 because this
+	 * is a do-while loop.
+	 */
+	init_loop(LC2, (PARAM_N_t / 8) - 1 - 1);
 	do {
-		init_output_addresses_regular(m, stage_odd, second_half);
-		write_output_regular(m, out, second_half);
+		/* Write the outputs of the previous cycle */
+		write_output_regular(m, out, second_half, out_s);
 
-		in = read_input_regular(m, ODD_CYCLE, second_half);
+		/* Odd cycle */
+		in = read_input_regular(m, ODD_CYCLE, stage);
 		out = butterfly(in);
 		next_cycle();
 
-		write_output_regular(m, out, second_half);
-		in = read_input_regular(m, EVEN_CYCLE, second_half);
+		/* Write the outputs of the previous cycle */
+		write_output_regular(m, out, second_half, out_s);
+
+		/* Even cycle */
+		in = read_input_regular(m, EVEN_CYCLE, stage);
 		out = butterfly(in);
 	} while (loop_next(LC2));
 
-	write_output_regular(m, out, second_half);
-	in = read_input_regular(m, ODD_CYCLE, second_half);
+	/* Write the outputs of the previous cycle */
+	write_output_regular(m, out, second_half, out_s);
+
+	/* Last cycle */
+	in = read_input_regular(m, ODD_CYCLE, stage);
 	out = butterfly(in);
+	next_cycle();
+
+	/* Write the outputs of the last cycle */
+	write_output_regular(m, out, second_half, out_s);
+	/* Force the next cycle, because the next stage must read from the
+	 * memory we just wrote to */
 	next_cycle();
-	write_output_regular(m, out, second_half);
 }
 
-INLINE struct mems init_mem_mapping(bool stage_odd){
+/**
+ * Assigns the input and output memories, based on the current stage. Also
+ * assigns the twiddle memories, but those are fixed.
+ */
+INLINE struct mems init_mem_mapping(int stage){
 	struct mems res;
-	if (stage_odd) {
+	/* Use the left memories for input on the odd (i.e., first)
+	 * stages and the right memories on the even stages. */
+	if ((stage % 2) == 0) {
 		res.input_a_re = alloc_mem(P0M1);
 		res.input_a_im = alloc_mem(P1M1);
 		res.input_b_re = alloc_mem(P2M1);
@@ -211,24 +290,29 @@ INLINE struct mems init_mem_mapping(bool stage_odd){
 	return res;
 }
 
+
+INLINE void do_regular_stage(int stage, enum in_strategy in_s, enum out_strategy out_s)
+{
+	struct mems m = init_mem_mapping(stage);
+	init_input_addresses_regular(m);
+	/* do_half_regular_stage will init the output addresses */
+	next_cycle();
+	do_half_regular_stage(m, stage, FIRST_HALF, in_s, out_s);
+	do_half_regular_stage(m, stage, SECOND_HALF, in_s, out_s);
+}
+
 void run() {
-#ifdef __MONTIUMCC__
-	/* main.cpp will call init before pre_run(), so only need to call init for MontiumCC */
-	init();
-#endif
 	do {
 		freeze();
 	} while (gpi(0) == 0);
-	struct mems m;
+
+	do_regular_stage(1, REGULAR_IN, REGULAR_OUT);
+	do_regular_stage(2, REGULAR_IN, REGULAR_OUT);
+	do_regular_stage(3, REGULAR_IN, REGULAR_OUT);
+	do_regular_stage(4, REGULAR_IN, BITREVERSED_OUT);
 
-	m = init_mem_mapping(EVEN_STAGE);
-	init_input_addresses_regular(m, EVEN_STAGE);
-	/* do_half_regular_stage will init output addresses */
-	next_cycle();
-	do_half_regular_stage(m, EVEN_STAGE, FIRST_HALF);
-	do_half_regular_stage(m, EVEN_STAGE, SECOND_HALF);
+	set_gpo(0);
 	next_cycle();
-	init_input_addresses_regular(m, ODD_STAGE);
-	m = init_mem_mapping(ODD_STAGE);
+	freeze();
+	clear_gpo(0);
 	next_cycle();
-	do_half_regular_stage(m, ODD_STAGE, FIRST_HALF);
-	do_half_regular_stage(m, ODD_STAGE, SECOND_HALF);
+	freeze();
 }
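After this final hunk, run() chains four regular stages (so PARAM_N_t = 16 appears to be assumed throughout), with the last stage writing its results bitreversed. When testing the Montium output, a direct O(N^2) DFT makes a convenient golden reference. A hypothetical plain-C sketch (dft_ref is not part of this code; float instead of the Montium's fixed point, so expect scaling and rounding differences):

	#include <complex.h>

	#define N 16	/* assumed PARAM_N_t */

	/* Direct DFT: X[k] = sum_n x[n] * e^(-2*pi*i*n*k/N). The FFT output
	 * above should match this up to fixed-point scaling, with sample k
	 * found at the bitreversed address of k (split over memories a/b,
	 * as described at the BITREVERSED_OUT write path). */
	static void dft_ref(const float complex x[N], float complex X[N])
	{
		const float pi = 3.14159265f;
		for (int k = 0; k < N; k++) {
			X[k] = 0;
			for (int n = 0; n < N; n++)
				X[k] += x[n] * cexpf(-2.0f * pi * I * (float)(n * k) / N);
		}
	}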