X-Git-Url: https://git.stderr.nl/gitweb?p=matthijs%2Fprojects%2Fmontium-fft.git;a=blobdiff_plain;f=FFT.mc;h=a76377e1c70e3c26b829cd043dff8154b3699d29;hp=e58d2fa771e0c644297c99d1935d66dd8b1055ba;hb=HEAD;hpb=e09e88b193ec1b84311b345abb2b9d7943060602

diff --git a/FFT.mc b/FFT.mc
index e58d2fa..a76377e 100644
--- a/FFT.mc
+++ b/FFT.mc
@@ -6,28 +6,20 @@
 #include "FFT.h"
 
-	word in_a_re, in_a_im, in_b_re, in_b_im, in_W_re, in_W_im;
-	//out_a_re, out_a_im, out_b_re, out_b_im;
-	//mem input_a_re, input_a_im, input_b_re, input_b_im, output_a_re, output_a_im, output_b_re, output_b_im, twiddle_re, twiddle_im;
-
-void init()
-{
-
-}
-
-
-
-/*void update_gpi() {
-	set_gpi(0, 1);
-}*/
-
+/**
+ * Executes a single butterfly on ALU 0-3. The inputs are the words taken
+ * from in, which are routed to the various inputs of ALU 0-3. Outputs are
+ * returned and will also be available on the output ports of ALU 0 and 2.
+ */
 INLINE struct bf_out butterfly(struct bf_in in) {
 	struct bf_out out;
 	/* ALU 0 & 1 */
 	/* im(W) * im(b) */
 	aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));
+	/* re(W * b) = re(W) * re(b) - im(W) * im(b) */
 	aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);
 
+	/* re(out_a) = re(a) + re(W * b) */
 	out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));
 
@@ -48,23 +40,55 @@ INLINE struct bf_out butterfly(struct bf_in in) {
 	return out;
 }
 
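For reference, the butterfly above computes the standard radix-2 decimation-in-time step out_a = a + W*b, out_b = a - W*b, with the complex multiplication unrolled into four real expressions spread over ALU 0-3. A minimal plain-C sketch of the same arithmetic (floating point instead of the Montium's fixed point; butterfly_ref and cpx_pair are hypothetical names, not part of this code):

	#include <complex.h>

	struct cpx_pair { float complex a, b; };

	/* Reference radix-2 DIT butterfly: out_a = a + W*b, out_b = a - W*b.
	 * The MontiumCC version computes the same four real expressions,
	 * e.g. re(out_a) = re(a) + (re(W)*re(b) - im(W)*im(b)). */
	static struct cpx_pair butterfly_ref(float complex a, float complex b,
	                                     float complex W)
	{
		struct cpx_pair out;
		float complex Wxb = W * b;	/* the twiddled b input */
		out.a = a + Wxb;
		out.b = a - Wxb;
		return out;
	}
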
-INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half) {
-	add_offset(m.output_a_re, 2);
-	add_offset(m.output_a_im, 2);
-	add_offset(m.output_b_re, 2);
-	add_offset(m.output_b_im, 2);
-
-	if (second_half) {
-		write_mem(m.output_a_re, res.a_re);
-		write_mem(m.output_a_im, res.a_im);
-		write_mem(m.output_b_re, res.b_re);
-		write_mem(m.output_b_im, res.b_im);
+/**
+ * Writes the output of a butterfly given in res to the correct memory
+ * locations.
+ * @param second_half Are we in the second half of the stage?
+ */
+INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half, enum out_strategy out_s) {
+	if (out_s == REGULAR_OUT) {
+		/* Skip a memory on each cycle during the regular stages */
+		add_offset(m.output_a_re, 2);
+		add_offset(m.output_a_im, 2);
+		add_offset(m.output_b_re, 2);
+		add_offset(m.output_b_im, 2);
 	} else {
-		/* Write a results to memory b and v.v. */
+		/* Simply write the output linearly */
+		add_offset(m.output_a_re, 1);
+		add_offset(m.output_a_im, 1);
+		add_offset(m.output_b_re, 1);
+		add_offset(m.output_b_im, 1);
+	}
+	if (out_s == BITREVERSED_OUT) {
+		/*
+		 * Use the memories (whose addresses are n_t - 1 bits wide)
+		 * bitreversed. Since we are generating the samples in
+		 * sequence (0, 1, 2, 3, ...) but writing them alternately
+		 * to two different memories (0, 8, 1, 9, ...), the last bit
+		 * is already bitreversed, so in effect we have fully
+		 * bitreversed the results. Note that this holds in the
+		 * non-distributed case (Q = 1), but might also hold in the
+		 * distributed case (if the tile numbers are bitreversed
+		 * before concatenating memory).
+		 */
+		use_bitreverse(m.output_a_re, PARAM_n_t - 1);
+		use_bitreverse(m.output_a_im, PARAM_n_t - 1);
+		use_bitreverse(m.output_b_re, PARAM_n_t - 1);
+		use_bitreverse(m.output_b_im, PARAM_n_t - 1);
+	}
+
+	if (out_s == REGULAR_OUT && second_half) {
+		/* During the second half of the regular stages, swap
+		 * memories a and b */
 		write_mem(m.output_a_re, res.b_re);
 		write_mem(m.output_a_im, res.b_im);
 		write_mem(m.output_b_re, res.a_re);
 		write_mem(m.output_b_im, res.a_im);
+	} else {
+		/* Simply write a to mem a and b to mem b */
+		write_mem(m.output_a_re, res.a_re);
+		write_mem(m.output_a_im, res.a_im);
+		write_mem(m.output_b_re, res.b_re);
+		write_mem(m.output_b_im, res.b_im);
 	}
 }
 
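The bitreversal argument in the comment above can be sanity-checked with a small, hypothetical plain-C model (not MontiumCC code; it assumes PARAM_n_t = 4, i.e. 2^4 = 16 samples, and bitrev is a helper defined here):

	#include <stdio.h>

	/* Reverse the low `bits` bits of v. */
	static unsigned bitrev(unsigned v, int bits)
	{
		unsigned r = 0;
		for (int i = 0; i < bits; i++)
			r |= ((v >> i) & 1u) << (bits - 1 - i);
		return r;
	}

	int main(void)
	{
		const int n_t = 4;	/* assumed PARAM_n_t */
		for (unsigned k = 0; k < 16; k++) {
			unsigned mem  = k & 1;			/* memory a (0) or b (1) */
			unsigned addr = bitrev(k >> 1, n_t - 1);/* what use_bitreverse yields */
			unsigned full = bitrev(k, n_t);		/* full n_t-bit bitreversal */
			printf("k=%2u -> mem %c addr %u, full bitrev %2u\n",
			       k, mem ? 'b' : 'a', addr, full);
		}
		return 0;
	}

For every k, mem * 8 + addr equals full: alternating between the two memories supplies the reversed last address bit, and use_bitreverse on the remaining n_t - 1 bits does the rest, exactly as the comment argues for the Q = 1 case.
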
@@ -78,9 +102,9 @@ INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_h
  * read input a from memory b and v.v. If not,
  * simply read a from memory a and b from memory b.
  */
-INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage_odd) {
+INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, int stage) {
 	struct bf_in in;
-	/* TODO: Select left or right memories */
+	/* Swap memories a and b during the odd cycles */
 	if (cycle_odd) {
 		in.a_re = read_mem(m.input_a_re);
 		in.a_im = read_mem(m.input_a_im);
@@ -95,22 +119,28 @@ INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage
 	in.W_re = read_mem(m.twiddle_re);
 	in.W_im = read_mem(m.twiddle_im);
-
+
+	/* Read the inputs sequentially */
 	add_offset(m.input_a_re, 1);
 	add_offset(m.input_a_im, 1);
 	add_offset(m.input_b_re, 1);
 	add_offset(m.input_b_im, 1);
-	/* TODO: Update twiddle offsets */
+
+	/* TODO: Is this true? */
+	add_offset(m.twiddle_re, (PARAM_N_t>>stage));
+	add_offset(m.twiddle_im, (PARAM_N_t>>stage));
+	use_mask(m.twiddle_re, (PARAM_N_t/2)-1);
+	use_mask(m.twiddle_im, (PARAM_N_t/2)-1);
+
 	return in;
 }
 
 /**
- * Initializes the addresses for the various memories.
- * @param stage_odd True if this is an odd stage.
- *@param second_half True if we are initing halfway a stage.
+ * Initializes the addresses for reading the inputs and twiddle factors.
+ * Should be called once at the start of each stage.
  */
-INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {
-	/* TODO: Select left or right memories */
+INLINE void init_input_addresses_regular(struct mems m) {
+	/* We simply start reading incrementally from address 0 */
 	set_base(m.input_a_im, 0);
 	set_base(m.input_b_re, 0);
 	set_base(m.input_b_im, 0);
@@ -124,69 +154,118 @@ INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {
 	set_offset(m.twiddle_re, 0);
 	set_offset(m.twiddle_im, 0);
 }
-
-
-INLINE void init_output_addresses_regular(struct mems m, bool stage_odd, bool second_half) {
+
+/**
+ * Initializes the addresses for writing the outputs. This function must be
+ * called twice per stage, since halfway through the stage the addressing changes.
+ */
+INLINE void init_output_addresses_regular(struct mems m, bool second_half, enum out_strategy out_s) {
+	/* Only the regular stages need their output addresses reset for the
+	 * second half */
+	if (out_s != REGULAR_OUT && second_half)
+		return;
 	/*
 	 * For the second half of the stage, the starting addresses are
	 * reversed. write_output_regular above will also swap the output
	 * memories.
	 * TODO: Better comments :-)
	 */
-
 	set_base(m.output_a_re, 0);
 	set_base(m.output_a_im, 0);
 	set_base(m.output_b_re, 0);
 	set_base(m.output_b_im, 0);
-
-	/* We subtract two from every address, since write_output_regular
-	 * adds two to the offset before writing the first (and every other)
-	 * result. */
-	if (second_half) {
-		set_offset(m.output_a_re, 1-2);
-		set_offset(m.output_a_im, 1-2);
-		set_offset(m.output_b_re, 0-2);
-		set_offset(m.output_b_im, 0-2);
+	if (out_s == REGULAR_OUT) {
+		/* We subtract two from every address, since write_output_regular
+		 * adds two to the offset before writing the first (and every
+		 * other) result. */
+		if (second_half) {
+			set_offset(m.output_a_re, 1-2);
+			set_offset(m.output_a_im, 1-2);
+			set_offset(m.output_b_re, 0-2);
+			set_offset(m.output_b_im, 0-2);
+		} else {
+			set_offset(m.output_a_re, 0-2);
+			set_offset(m.output_a_im, 0-2);
+			set_offset(m.output_b_re, 1-2);
+			set_offset(m.output_b_im, 1-2);
+		}
 	} else {
-		set_offset(m.output_a_re, 1-2);
-		set_offset(m.output_a_im, 1-2);
-		set_offset(m.output_b_re, 0-2);
-		set_offset(m.output_b_im, 0-2);
+		/* Write sequentially, starting at 0 for both memories */
+		set_offset(m.output_a_re, 0-1);
+		set_offset(m.output_a_im, 0-1);
+		set_offset(m.output_b_re, 0-1);
+		set_offset(m.output_b_im, 0-1);
 	}
 }
 
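The "Is this true?" TODO in read_input_regular above is easy to sanity-check outside the Montium. A hypothetical plain-C sketch of the twiddle index sequence implied by the add_offset/use_mask pair (assuming PARAM_N_t = 16, matching the four stages in run() below; the real value comes from FFT.h):

	#include <stdio.h>

	#define PARAM_N_t 16	/* assumed */

	int main(void)
	{
		for (int stage = 1; stage <= 4; stage++) {
			int idx = 0;	/* set_offset(m.twiddle_re, 0) */
			printf("stage %d:", stage);
			for (int k = 0; k < PARAM_N_t / 2; k++) {
				printf(" %2d", idx);
				/* add_offset by N_t >> stage, masked to the
				 * N_t/2-entry twiddle table */
				idx = (idx + (PARAM_N_t >> stage)) & (PARAM_N_t / 2 - 1);
			}
			printf("\n");
		}
		return 0;
	}

This prints 0 0 0 ... for stage 1, 0 4 0 4 ... for stage 2, 0 2 4 6 ... for stage 3 and 0 1 2 ... 7 for stage 4: each stage cycles through exactly the twiddle set a radix-2 DIT stage needs (only W^0 in the first stage, all of W^0..W^7 in the last), which suggests the offset and mask are right; the exact interleaving order depends on how the previous stage laid out its outputs.
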
-INLINE void do_half_regular_stage(struct mems m, bool stage_odd, bool second_half){
-	struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage_odd);
+INLINE void do_half_regular_stage(struct mems m, int stage, bool second_half, enum in_strategy in_s, enum out_strategy out_s){
+	/*
+	 * We are doing two cycles in each iteration, so we can alternate the
+	 * cycle_odd argument (which only works with constants; I don't expect
+	 * the optimizer to do this loop unrolling for us). Since we need to
+	 * write outputs before reading, but don't have any outputs to write
+	 * in the first cycle, we must put the first cycle outside of the
+	 * loop. Since the loop does two cycles at a time, this means there
+	 * must be two cycles outside of the loop, so we put one at the end as
+	 * well. Additionally, we also need to write the outputs of the last
+	 * cycle in an extra cycle at the end. We probably can't combine this
+	 * last cycle with the first cycle of the next stage, because they
+	 * need the same memories (input becomes output and v.v.).
+	 */
+
+	/* Initialize the output addresses; this must be done twice per stage */
+	init_output_addresses_regular(m, second_half, out_s);
+
+	/* First cycle (no previous output to write) */
+	struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage);
 	struct bf_out out = butterfly(in);
-
-	/* Now, do a single stage. That means N_t / 2 cycles. Since we do 2
+
+	/* Now, do half of a single stage. That means N_t / 4 cycles. Since we do 2
 	 * cycles on every iteration, plus one before and after the loop,
-	 * we will loop N_t / 4 - 1 times. */
-	init_loop(LC2, (N_t / 4) - 1);
+	 * we will loop N_t / 8 - 1 times. We subtract an extra 1 because this
+	 * is a do-while loop.
+	 */
+	init_loop(LC2, (PARAM_N_t / 8) - 1 - 1);
 	do {
-		init_output_addresses_regular(m, stage_odd, second_half);
-		write_output_regular(m, out, second_half);
+		/* Write the outputs of the previous cycle */
+		write_output_regular(m, out, second_half, out_s);
 
-		in = read_input_regular(m, ODD_CYCLE, second_half);
+		/* Odd cycle */
+		in = read_input_regular(m, ODD_CYCLE, stage);
 		out = butterfly(in);
 		next_cycle();
 
-		write_output_regular(m, out, second_half);
-		in = read_input_regular(m, EVEN_CYCLE, second_half);
+		/* Write the outputs of the previous cycle */
+		write_output_regular(m, out, second_half, out_s);
+
+		/* Even cycle */
+		in = read_input_regular(m, EVEN_CYCLE, stage);
 		out = butterfly(in);
 	} while (loop_next(LC2));
 
-	write_output_regular(m, out, second_half);
-	in = read_input_regular(m, ODD_CYCLE, second_half);
+	/* Write the outputs of the previous cycle */
+	write_output_regular(m, out, second_half, out_s);
+
+	/* Last cycle */
+	in = read_input_regular(m, ODD_CYCLE, stage);
 	out = butterfly(in);
+	next_cycle();
+
+	/* Write the outputs of the last cycle */
+	write_output_regular(m, out, second_half, out_s);
+	/* Force the next cycle, because the next stage must read from the
+	 * memory we just wrote to */
 	next_cycle();
-	write_output_regular(m, out, second_half);
 }
 
-INLINE struct mems init_mem_mapping(bool stage_odd){
+/**
+ * Assigns the input and output memories, based on the current stage. Also
+ * assigns the twiddle memories, but those are fixed.
+ */
+INLINE struct mems init_mem_mapping(int stage){
 	struct mems res;
-	if (stage_odd) {
+	/* Use the left memories for input on the odd (i.e., first)
+	 * stages and the right memories on the even stages. */
+	if ((stage % 2) == 0) {
 		res.input_a_re = alloc_mem(P0M1);
 		res.input_a_im = alloc_mem(P1M1);
 		res.input_b_re = alloc_mem(P2M1);
@@ -211,24 +290,29 @@ INLINE struct mems init_mem_mapping(bool stage_odd){
 	return res;
 }
 
+
+INLINE void do_regular_stage(int stage, enum in_strategy in_s, enum out_strategy out_s)
+{
+	struct mems m = init_mem_mapping(stage);
+	init_input_addresses_regular(m);
+	/* do_half_regular_stage will init the output addresses */
+	next_cycle();
+	do_half_regular_stage(m, stage, FIRST_HALF, in_s, out_s);
+	do_half_regular_stage(m, stage, SECOND_HALF, in_s, out_s);
+}
+
 void run() {
-#ifdef __MONTIUMCC__
-	/* main.cpp will call init before pre_run(), so only need to call init for MontiumCC */
-	init();
-#endif
 	do {
 		freeze();
 	} while (gpi(0) == 0);
-	struct mems m;
+
+	do_regular_stage(1, REGULAR_IN, REGULAR_OUT);
+	do_regular_stage(2, REGULAR_IN, REGULAR_OUT);
+	do_regular_stage(3, REGULAR_IN, REGULAR_OUT);
+	do_regular_stage(4, REGULAR_IN, BITREVERSED_OUT);
 
-	m = init_mem_mapping(EVEN_STAGE);
-	init_input_addresses_regular(m, EVEN_STAGE);
-	/* do_half_regular_stage will init output addresses */
-	next_cycle();
-	do_half_regular_stage(m, EVEN_STAGE, FIRST_HALF);
-	do_half_regular_stage(m, EVEN_STAGE, SECOND_HALF);
+	set_gpo(0);
 	next_cycle();
-	init_input_addresses_regular(m, ODD_STAGE);
-	m = init_mem_mapping(ODD_STAGE);
+	freeze();
+	clear_gpo(0);
 	next_cycle();
-	do_half_regular_stage(m, ODD_STAGE, FIRST_HALF);
-	do_half_regular_stage(m, ODD_STAGE, SECOND_HALF);
+	freeze();
 }
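After this final hunk, run() chains four regular stages (so PARAM_N_t = 16 appears to be assumed throughout), with the last stage writing its results bitreversed. When testing the Montium output, a direct O(N^2) DFT makes a convenient golden reference. A hypothetical plain-C sketch (dft_ref is not part of this code; float instead of the Montium's fixed point, so expect scaling and rounding differences):

	#include <complex.h>

	#define N 16	/* assumed PARAM_N_t */

	/* Direct DFT: X[k] = sum_n x[n] * e^(-2*pi*i*n*k/N). The FFT output
	 * above should match this up to fixed-point scaling, with sample k
	 * found at the bitreversed address of k (split over memories a/b,
	 * as described at the BITREVERSED_OUT write path). */
	static void dft_ref(const float complex x[N], float complex X[N])
	{
		const float pi = 3.14159265f;
		for (int k = 0; k < N; k++) {
			X[k] = 0;
			for (int n = 0; n < N; n++)
				X[k] += x[n] * cexpf(-2.0f * pi * I * (float)(n * k) / N);
		}
	}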