#ifndef __MONTIUMCC__
#include
#include
#include
#endif

#include "FFT.h"

/**
 * Executes a single butterfly on ALU 0-3. The inputs are the words taken from
 * in, which will be read on various inputs of ALU 0-3. Outputs will be
 * returned and will be available on the output ports of ALU 0 and 2.
 */
INLINE struct bf_out butterfly(struct bf_in in) {
	struct bf_out out;

	/* ALU 0 & 1 */
	/* im(W) * im(b) */
	aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));
	/* re(W * b) = re(W) * re(b) - im(W) * im(b) */
	aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);

	/* re(out_a) = re(a) + re(W * b) */
	out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));
	/* re(out_b) = re(a) - re(W * b) */
	out.b_re = p0o1(ssub_bf(rb1(in.a_re), Wxbr));

	/* ALU 2 & 3 */
	/* im(W) * re(b) */
	aluexp Wixbr = west(fmul(rd1(in.W_im), rb1(in.b_re)));
	/* im(W * b) = re(W) * im(b) + im(W) * re(b) */
	aluexp Wxbi = sadd_acc(fmul(rc1(in.W_re), ra1(in.b_im)), Wixbr);

	/* im(out_a) = im(a) + im(W * b) */
	out.a_im = p2o0(sadd_bf(rb1(in.a_im), Wxbi));
	/* im(out_b) = im(a) - im(W * b) */
	out.b_im = p2o1(ssub_bf(rb1(in.a_im), Wxbi));

	return out;
}

/**
 * Writes the output of a butterfly given in res to the correct memory
 * locations.
 * @param second_half Are we in the second half of the stage?
 */
INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half, enum out_strategy out_s) {
	if (out_s == REGULAR_OUT) {
		/* Skip a memory on each cycle during the regular stages */
		add_offset(m.output_a_re, 2);
		add_offset(m.output_a_im, 2);
		add_offset(m.output_b_re, 2);
		add_offset(m.output_b_im, 2);
	} else {
		/* Simply write output linearly */
		add_offset(m.output_a_re, 1);
		add_offset(m.output_a_im, 1);
		add_offset(m.output_b_re, 1);
		add_offset(m.output_b_im, 1);
	}

	if (out_s == BITREVERSED_OUT) {
		/*
		 * Use the memories (which use n_t - 1 address bits)
		 * bit-reversed. Since we are generating the samples in
		 * sequence (0, 1, 2, 3, ...) but are writing them to two
		 * different memories (0, 8, 1, 9, ...), the last address bit
		 * is effectively already bit-reversed; reversing the
		 * remaining n_t - 1 bits thus fully bit-reverses the
		 * results. Note that this holds in the non-distributed case
		 * (Q = 1), but might also hold in the distributed case (if
		 * the tile numbers are bit-reversed before concatenating
		 * memory).
		 */
		use_bitreverse(m.output_a_re, PARAM_n_t - 1);
		use_bitreverse(m.output_a_im, PARAM_n_t - 1);
		use_bitreverse(m.output_b_re, PARAM_n_t - 1);
		use_bitreverse(m.output_b_im, PARAM_n_t - 1);
	}

	if (out_s == REGULAR_OUT && second_half) {
		/* When in the regular stages, swap memory a and b during the
		 * second half */
		write_mem(m.output_a_re, res.b_re);
		write_mem(m.output_a_im, res.b_im);
		write_mem(m.output_b_re, res.a_re);
		write_mem(m.output_b_im, res.a_im);
	} else {
		/* Simply write a to mem a and b to mem b */
		write_mem(m.output_a_re, res.a_re);
		write_mem(m.output_a_im, res.a_im);
		write_mem(m.output_b_re, res.b_re);
		write_mem(m.output_b_im, res.b_im);
	}
}
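/*
 * For reference: a minimal host-side sketch of the butterfly above in plain
 * C, kept out of the MontiumCC build, assuming floats stand in for the
 * Montium fixed-point words. The name ref_butterfly and its parameters are
 * illustrative and not part of FFT.h.
 */
#ifndef __MONTIUMCC__
static void ref_butterfly(float a_re, float a_im, float b_re, float b_im,
                          float W_re, float W_im,
                          float *a_out_re, float *a_out_im,
                          float *b_out_re, float *b_out_im) {
	/* W * b, decomposed exactly as on ALU 0-3 above */
	float Wxb_re = W_re * b_re - W_im * b_im;
	float Wxb_im = W_re * b_im + W_im * b_re;

	/* out_a = a + W * b */
	*a_out_re = a_re + Wxb_re;
	*a_out_im = a_im + Wxb_im;
	/* out_b = a - W * b */
	*b_out_re = a_re - Wxb_re;
	*b_out_im = a_im - Wxb_im;
}
#endif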
/**
 * Reads four inputs and two twiddle factors from memory.
 * Also increases memory offsets by 1 after reading.
 * @param stage_odd Is this an odd stage? If so, read from the left
 *                  memories, else read from the right memories
 *                  (not implemented yet).
 * @param cycle_odd Is this an odd cycle within the stage? If so,
 *                  read input a from memory b and v.v. If not,
 *                  simply read a from memory a and b from memory b.
 */
INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, int stage) {
	struct bf_in in;

	/* Swap memory a and b during the odd cycles */
	if (cycle_odd) {
		in.a_re = read_mem(m.input_a_re);
		in.a_im = read_mem(m.input_a_im);
		in.b_re = read_mem(m.input_b_re);
		in.b_im = read_mem(m.input_b_im);
	} else {
		in.b_re = read_mem(m.input_a_re);
		in.b_im = read_mem(m.input_a_im);
		in.a_re = read_mem(m.input_b_re);
		in.a_im = read_mem(m.input_b_im);
	}
	in.W_re = read_mem(m.twiddle_re);
	in.W_im = read_mem(m.twiddle_im);

	/* Read inputs sequentially */
	add_offset(m.input_a_re, 1);
	add_offset(m.input_a_im, 1);
	add_offset(m.input_b_re, 1);
	add_offset(m.input_b_im, 1);

	/* TODO: Is this true? */
	add_offset(m.twiddle_re, (PARAM_N_t >> stage));
	add_offset(m.twiddle_im, (PARAM_N_t >> stage));
	use_mask(m.twiddle_re, (PARAM_N_t / 2) - 1);
	use_mask(m.twiddle_im, (PARAM_N_t / 2) - 1);

	return in;
}

/**
 * Initializes the addresses for reading the inputs and twiddle factors.
 * Should be called once at the start of each stage.
 */
INLINE void init_input_addresses_regular(struct mems m) {
	/* We simply start reading at address 0 incrementally */
	set_base(m.input_a_re, 0);
	set_base(m.input_a_im, 0);
	set_base(m.input_b_re, 0);
	set_base(m.input_b_im, 0);
	set_base(m.twiddle_re, 0);
	set_base(m.twiddle_im, 0);

	set_offset(m.input_a_re, 0);
	set_offset(m.input_a_im, 0);
	set_offset(m.input_b_re, 0);
	set_offset(m.input_b_im, 0);
	set_offset(m.twiddle_re, 0);
	set_offset(m.twiddle_im, 0);
}

/**
 * Initializes the addresses for writing the outputs. This function must be
 * called twice per stage, since halfway through the stage the addressing
 * changes.
 */
INLINE void init_output_addresses_regular(struct mems m, bool second_half, enum out_strategy out_s) {
	/* Only reset the memory addresses for the second half during the
	 * regular stages */
	if (out_s != REGULAR_OUT && second_half)
		return;

	/*
	 * For the second half of the stage, the starting addresses are
	 * reversed. write_output_regular above will also swap the output
	 * memories.
	 * TODO: Better comments :-)
	 */
	set_base(m.output_a_re, 0);
	set_base(m.output_a_im, 0);
	set_base(m.output_b_re, 0);
	set_base(m.output_b_im, 0);

	if (out_s == REGULAR_OUT) {
		/* We subtract two from every address, since
		 * write_output_regular adds two to the offset before writing
		 * the first (and every other) result. */
		if (second_half) {
			set_offset(m.output_a_re, 1 - 2);
			set_offset(m.output_a_im, 1 - 2);
			set_offset(m.output_b_re, 0 - 2);
			set_offset(m.output_b_im, 0 - 2);
		} else {
			set_offset(m.output_a_re, 0 - 2);
			set_offset(m.output_a_im, 0 - 2);
			set_offset(m.output_b_re, 1 - 2);
			set_offset(m.output_b_im, 1 - 2);
		}
	} else {
		/* Write sequentially, starting at 0 for both memories */
		set_offset(m.output_a_re, 0 - 1);
		set_offset(m.output_a_im, 0 - 1);
		set_offset(m.output_b_re, 0 - 1);
		set_offset(m.output_b_im, 0 - 1);
	}
}
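/*
 * For reference: a host-side sketch of the twiddle addressing done in
 * read_input_regular above, kept out of the MontiumCC build. Offsets start
 * at 0, each butterfly advances them by PARAM_N_t >> stage, and use_mask
 * wraps them modulo PARAM_N_t / 2; stage 1 thus reads only W^0, and every
 * following stage steps through twice as many distinct twiddle factors.
 * ref_twiddle_index is an illustrative name, not part of FFT.h, and the
 * TODO in read_input_regular suggests the stride itself is unverified.
 */
#ifndef __MONTIUMCC__
static int ref_twiddle_index(int stage, int butterfly_nr) {
	/* butterfly_nr counts butterflies within the stage, from 0 */
	return (butterfly_nr * (PARAM_N_t >> stage)) & ((PARAM_N_t / 2) - 1);
}
#endif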
INLINE void do_half_regular_stage(struct mems m, int stage, bool second_half, enum in_strategy in_s, enum out_strategy out_s) {
	/*
	 * We are doing two cycles in each iteration, so we can alternate the
	 * cycle_odd argument (which only works with constants; I don't
	 * expect the optimizer to do this loop unrolling for us). Since we
	 * need to write outputs before reading, but don't have any outputs
	 * to write in the first cycle, we must put the first cycle outside
	 * of the loop. Since the loop does two cycles at a time, this means
	 * there must be two cycles outside of the loop, so we put one at the
	 * end as well. Additionally, we also need to write the outputs of
	 * the last cycle in an extra cycle at the end. We probably can't
	 * combine this last cycle with the first cycle of the next stage,
	 * because they need the same memories (input becomes output and
	 * v.v.).
	 */

	/* Initialize output addresses; this must be done twice per stage */
	init_output_addresses_regular(m, second_half, out_s);

	/* First cycle (no previous output to write) */
	struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage);
	struct bf_out out = butterfly(in);

	/* Now, do half a stage. That means N_t / 4 cycles. Since we do 2
	 * cycles on every iteration, plus one before and one after the loop,
	 * we will loop N_t / 8 - 1 times. We add an extra - 1 because this
	 * is a do while loop (see the reference sketch at the end of this
	 * file). */
	init_loop(LC2, (PARAM_N_t / 8) - 1 - 1);
	do {
		/* Write outputs of previous cycle */
		write_output_regular(m, out, second_half, out_s);

		/* Odd cycle */
		in = read_input_regular(m, ODD_CYCLE, stage);
		out = butterfly(in);
		next_cycle();

		/* Write outputs of previous cycle */
		write_output_regular(m, out, second_half, out_s);

		/* Even cycle */
		in = read_input_regular(m, EVEN_CYCLE, stage);
		out = butterfly(in);
	} while (loop_next(LC2));

	/* Write outputs of previous cycle */
	write_output_regular(m, out, second_half, out_s);

	/* Last cycle */
	in = read_input_regular(m, ODD_CYCLE, stage);
	out = butterfly(in);
	next_cycle();

	/* Write outputs of last cycle */
	write_output_regular(m, out, second_half, out_s);

	/* Force the next cycle, because the next stage must read from the
	 * memory we just wrote to */
	next_cycle();
}

/**
 * Assigns the input and output memories, based on the current stage. Also
 * assigns the twiddle memories, but those are fixed.
 */
INLINE struct mems init_mem_mapping(int stage) {
	struct mems res;

	/* Use left memories for input on odd (i.e., first) stages and right
	 * memories on even stages. */
	if ((stage % 2) == 0) {
		res.input_a_re = alloc_mem(P0M1);
		res.input_a_im = alloc_mem(P1M1);
		res.input_b_re = alloc_mem(P2M1);
		res.input_b_im = alloc_mem(P3M1);
		res.output_a_re = alloc_mem(P0M0);
		res.output_a_im = alloc_mem(P1M0);
		res.output_b_re = alloc_mem(P2M0);
		res.output_b_im = alloc_mem(P3M0);
	} else {
		res.input_a_re = alloc_mem(P0M0);
		res.input_a_im = alloc_mem(P1M0);
		res.input_b_re = alloc_mem(P2M0);
		res.input_b_im = alloc_mem(P3M0);
		res.output_a_re = alloc_mem(P0M1);
		res.output_a_im = alloc_mem(P1M1);
		res.output_b_re = alloc_mem(P2M1);
		res.output_b_im = alloc_mem(P3M1);
	}

	res.twiddle_re = alloc_mem(P4M0);
	res.twiddle_im = alloc_mem(P4M1);

	return res;
}

INLINE void do_regular_stage(int stage, enum in_strategy in_s, enum out_strategy out_s) {
	struct mems m = init_mem_mapping(stage);
	init_input_addresses_regular(m);
	/* do_half_regular_stage will init output addresses */
	next_cycle();
	do_half_regular_stage(m, stage, FIRST_HALF, in_s, out_s);
	do_half_regular_stage(m, stage, SECOND_HALF, in_s, out_s);
}

void run() {
	/* Wait for the start signal on GPI 0 */
	do {
		freeze();
	} while (gpi(0) == 0);

	do_regular_stage(1, REGULAR_IN, REGULAR_OUT);
	do_regular_stage(2, REGULAR_IN, REGULAR_OUT);
	do_regular_stage(3, REGULAR_IN, REGULAR_OUT);
	do_regular_stage(4, REGULAR_IN, BITREVERSED_OUT);

	/* Signal completion on GPO 0 */
	set_gpo(0);
	next_cycle();
	freeze();
	clear_gpo(0);
	next_cycle();
	freeze();
}
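/*
 * For reference: a host-side check of the cycle bookkeeping in
 * do_half_regular_stage, kept out of the MontiumCC build. One butterfly is
 * issued before the loop, two per iteration and one after it; assuming the
 * do/while runs once more than the init_loop value (as the "extra - 1"
 * comment implies), initializing LC2 with PARAM_N_t / 8 - 1 - 1 gives the
 * PARAM_N_t / 4 butterflies that make up half a stage.
 * ref_half_stage_butterflies is an illustrative name, not part of FFT.h.
 */
#ifndef __MONTIUMCC__
static int ref_half_stage_butterflies(void) {
	int loop_iterations = ((PARAM_N_t / 8) - 1 - 1) + 1; /* init value + 1 */
	return 1 + 2 * loop_iterations + 1;                  /* == PARAM_N_t / 4 */
}
#endif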