FFT.mc

   1 #ifndef __MONTIUMCC__\r
   2 #include <memory.h>\r
   3 #include <math.h>\r
   4 #include <stdio.h>\r
   5 #endif\r
   6 \r
   7 #include "FFT.h"\r
   8 \r
   9 /**\r
  10  * Executes a single butterfly on ALU 0-3. The inputs are the words taken from\r
  11  * in, which will be read on various inputs of ALU 0-3. Outputs will be\r
  12  * returned and will we be available on the output ports of ALU 0 and 2.\r
  13  */\r
  14 INLINE struct bf_out butterfly(struct bf_in in) {\r
  15         struct bf_out out;\r
  16         /* ALU 0 & 1 */\r
  17                 /* im(W) * im(b) */\r
  18                 aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));\r
  19         \r
  20                 /* re(W * b) = re(W) * re(b) - im(W) * im(b) */\r
  21                 aluexp Wxbr  = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);\r
  22 \r
  23                 \r
  24                 /* re(out_a) = re(a) + re(W * b) */\r
  25                 out.a_re =   p0o0(sadd_bf(rb1(in.a_re), Wxbr));\r
  26                 /* re(out_b) = re(a) - re(W * b) */\r
  27                 out.b_re = p0o1(ssub_bf(rb1(in.a_re), Wxbr));\r
  28                 \r
  29         /* ALU 2 & 3 */ \r
  30                 /* im(W) * re(b) */\r
  31                 aluexp Wixbr = west(fmul(rd1(in.W_im), rb1(in.b_re)));\r
  32                 /* im(W * b) = re(W) * im(b) + im(W) * re(b) */\r
  33                 aluexp Wxbi  = sadd_acc(fmul(rc1(in.W_re), ra1(in.b_im)), Wixbr);\r
  34                 \r
  35                 /* im(out_a) = im(a) + im(W * b) */\r
  36                 out.a_im =   p2o0(sadd_bf(rb1(in.a_im), Wxbi));\r
  37                 /* im(out_b) = im(a) - im(W * b) */\r
  38                 out.b_im = p2o1(ssub_bf(rb1(in.a_im), Wxbi));\r
  39                 \r
  40                 return out;\r
  41 }\r
  42 \r
  43 /**\r
  44  * Writes the output of a butterfly given in res to the correct memory\r
  45  * locations.\r
  46  * @param  second_half   Are we in the second half of the stage?\r
  47  */\r
  48 INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half) {\r
  49         add_offset(m.output_a_re, 2);\r
  50         add_offset(m.output_a_im, 2);\r
  51         add_offset(m.output_b_re, 2);\r
  52         add_offset(m.output_b_im, 2);\r
  53         \r
  54         if (second_half) {\r
  55                 write_mem(m.output_a_re, res.a_re);\r
  56                 write_mem(m.output_a_im, res.a_im);\r
  57                 write_mem(m.output_b_re, res.b_re);\r
  58                 write_mem(m.output_b_im, res.b_im);\r
  59         } else {\r
  60                 /* Write a results to memory b and v.v. */\r
  61                 write_mem(m.output_a_re, res.b_re);\r
  62                 write_mem(m.output_a_im, res.b_im);\r
  63                 write_mem(m.output_b_re, res.a_re);\r
  64                 write_mem(m.output_b_im, res.a_im);\r
  65         }\r
  66 }\r
  67 \r
  68 /**\r
  69  * Reads four inputs and two twiddle factors from memory. \r
  70  * Also increases memory offsets by 1 after reading.\r
  71  * @param    stage_odd Is this an odd stage? If so, read from the left\r
  72  *                     memories, else read from the right memories\r
  73  *                     (not implemented yet).\r
  74  * @param    cycle_odd Is this an odd cycle within the stage? If so, \r
  75  *                     read input a from memory b and v.v. If not, \r
  76  *                     simply read a from memory a and b from memory b.\r
  77  */\r
  78 INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage_odd) {\r
  79         struct bf_in in;\r
  80          /* Swap memory a and b during the odd cycles */\r
  81         if (cycle_odd) {\r
  82                 in.a_re = read_mem(m.input_a_re);\r
  83                 in.a_im = read_mem(m.input_a_im);\r
  84                 in.b_re = read_mem(m.input_b_re);               \r
  85                 in.b_im = read_mem(m.input_b_im);\r
  86         } else {\r
  87                 in.b_re = read_mem(m.input_a_re);\r
  88                 in.b_im = read_mem(m.input_a_im);\r
  89                 in.a_re = read_mem(m.input_b_re);               \r
  90                 in.a_im = read_mem(m.input_b_im);\r
  91         }\r
  92         in.W_re = read_mem(m.twiddle_re);\r
  93         in.W_im = read_mem(m.twiddle_im);\r
  94         \r
  95         \r
  96         /* Read inputs sequentially */\r
  97         add_offset(m.input_a_re, 1);\r
  98         add_offset(m.input_a_im, 1);\r
  99         add_offset(m.input_b_re, 1);\r
 100         add_offset(m.input_b_im, 1);\r
 101         /* TODO: Update twiddle offsets */\r
 102         return in;\r
 103 }\r
 104 \r
 105 /**\r
 106  * Initializes the addresses for writing the outputs.\r
 107  * @param stage_odd   True if this is an odd stage.\r
 108  * @param second_half True if we are initing halfway a stage.\r
 109  */ \r
 110 INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {\r
 111         /* We simply start reading at address 0 incrementally */\r
 112         set_base(m.input_a_im, 0);\r
 113         set_base(m.input_b_re, 0);\r
 114         set_base(m.input_b_im, 0);\r
 115         set_base(m.twiddle_re, 0);\r
 116         set_base(m.twiddle_im, 0);\r
 117         \r
 118         set_offset(m.input_a_re, 0);\r
 119         set_offset(m.input_a_im, 0);\r
 120         set_offset(m.input_b_re, 0);\r
 121         set_offset(m.input_b_im, 0);\r
 122         set_offset(m.twiddle_re, 0);\r
 123         set_offset(m.twiddle_im, 0);\r
 124 }\r
 125 \r
 126 /**\r
 127  * Initializes the addresses for reading the inputs. This function must be\r
 128  * called twice per stage, since halfway the stage the addressing changes.\r
 129  */\r
 130 INLINE void init_output_addresses_regular(struct mems m, bool stage_odd, bool second_half) {\r
 131         /* \r
 132          * For the second half of the stage, the starting addresses are \r
 133          * reversed. write_output_regular above will also swap the output\r
 134          * memories.\r
 135          * TODO: Better comments :-)\r
 136          */\r
 137 \r
 138         set_base(m.output_a_re, 0);\r
 139         set_base(m.output_a_im, 0);\r
 140         set_base(m.output_b_re, 0);\r
 141         set_base(m.output_b_im, 0);\r
 142         \r
 143         /* We subtract two from every address, since write_output_regular \r
 144          * adds two to the offset before writing the first (and every other) \r
 145          * result. */\r
 146         if (second_half) {\r
 147                 set_offset(m.output_a_re, 1-2);\r
 148                 set_offset(m.output_a_im, 1-2);\r
 149                 set_offset(m.output_b_re, 0-2);\r
 150                 set_offset(m.output_b_im, 0-2);\r
 151         } else {\r
 152                 set_offset(m.output_a_re, 0-2);\r
 153                 set_offset(m.output_a_im, 0-2);\r
 154                 set_offset(m.output_b_re, 1-2);\r
 155                 set_offset(m.output_b_im, 1-2);\r
 156         }\r
 157 }\r
 158 \r
 159 INLINE void do_half_regular_stage(struct mems m, bool stage_odd, bool second_half){\r
 160          /*\r
 161          * We are doing two cycles in each iteration, so we can alternate the\r
 162          * cycle_odd argument (which only works with constants, I don't expect\r
 163          * the optimizer to do this loop unrolling for us). Since we need to\r
 164          * write outputs before reading, but don't have any outputs to write\r
 165          * in the first cycle, we must put the first cycle outside of the\r
 166          * loop. Since the loop does two cycles at a time, this means there\r
 167          * must be two cycles outside of the loop, so we put one at the end as\r
 168          * well. Additionally, we also need to write the outputs of the last\r
 169          * cycle in an extra cycle at the end. We probably can't combine this\r
 170          * last cycle with the first cycle of the next stage, because they\r
 171          * need the same memories (input becomes output and v.v.).\r
 172          */\r
 173 \r
 174         /* Initialize output addresses, this must be done twice per stage */\r
 175         init_output_addresses_regular(m, stage_odd, second_half);\r
 176 \r
 177         /* First cycle (no previous output to write) */\r
 178         struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage_odd);\r
 179         struct bf_out out = butterfly(in);\r
 180 \r
 181         /* Now, do half a single stage. That means N_t / 4 cycles. Since we do 2\r
 182          * cycles on every iteration, plus one before and after the loop,\r
 183          * we will loop N_t / 8 - 1 times. */\r
 184         init_loop(LC2, (PARAM_N_t / 8) - 1);\r
 185         do {\r
 186                 /* Write outputs of previous cycle */\r
 187                 write_output_regular(m, out, second_half);\r
 188 \r
 189                 /* Odd cycle */\r
 190                 in = read_input_regular(m, ODD_CYCLE, second_half);\r
 191                 out = butterfly(in);\r
 192                 next_cycle();\r
 193 \r
 194                 /* Write outputs of previous cycle */\r
 195                 write_output_regular(m, out, second_half);\r
 196 \r
 197                 /* Even cycle */\r
 198                 in = read_input_regular(m, EVEN_CYCLE, second_half);\r
 199                 out = butterfly(in);\r
 200         } while (loop_next(LC2));\r
 201         \r
 202         /* Write outputs of previous cycle */\r
 203         write_output_regular(m, out, second_half);\r
 204 \r
 205         /* Last cycle */\r
 206         in = read_input_regular(m, ODD_CYCLE, second_half);\r
 207         out = butterfly(in);\r
 208         next_cycle();\r
 209 \r
 210         /* Write outputs of last cycle */\r
 211         write_output_regular(m, out, second_half);\r
 212         \r
 213         /* Force the next cycle, because the next stage must read from\r
 214          * the memory we just wrote to */\r
 215         next_cycle();\r
 216 }\r
 217 \r
 218 INLINE struct mems init_mem_mapping(bool stage_odd){\r
 219         struct mems res;\r
 220         if (stage_odd) {\r
 221                 res.input_a_re   = alloc_mem(P0M1);\r
 222                 res.input_a_im   = alloc_mem(P1M1);\r
 223                 res.input_b_re   = alloc_mem(P2M1);\r
 224                 res.input_b_im   = alloc_mem(P3M1);\r
 225                 res.output_a_re  = alloc_mem(P0M0);\r
 226                 res.output_a_im  = alloc_mem(P1M0);\r
 227                 res.output_b_re  = alloc_mem(P2M0);\r
 228                 res.output_b_im  = alloc_mem(P3M0);\r
 229         } else {\r
 230                 res.input_a_re   = alloc_mem(P0M0);\r
 231                 res.input_a_im   = alloc_mem(P1M0);\r
 232                 res.input_b_re   = alloc_mem(P2M0);\r
 233                 res.input_b_im   = alloc_mem(P3M0);\r
 234                 res.output_a_re  = alloc_mem(P0M1);\r
 235                 res.output_a_im  = alloc_mem(P1M1);\r
 236                 res.output_b_re  = alloc_mem(P2M1);\r
 237                 res.output_b_im  = alloc_mem(P3M1);\r
 238         }\r
 239         \r
 240         res.twiddle_re   = alloc_mem(P4M0);\r
 241         res.twiddle_im   = alloc_mem(P4M1);\r
 242         \r
 243         return res;\r
 244 }\r
 245 void run() {\r
 246         do { freeze(); } while (gpi(0) == 0);\r
 247         struct mems m;\r
 248 \r
 249         /* We need to do n_t regular stages. Since we do two stages each\r
 250          * iteration, we'll do n_t / 2 iterations. */\r
 251         init_loop(LC1, (PARAM_n_t / 2));\r
 252         do {\r
 253                 m = init_mem_mapping(EVEN_STAGE);\r
 254                 init_input_addresses_regular(m, EVEN_STAGE);\r
 255                 /* do_half_regular_stage will init output addresses */\r
 256                 next_cycle();\r
 257                 do_half_regular_stage(m, EVEN_STAGE, FIRST_HALF);\r
 258                 do_half_regular_stage(m, EVEN_STAGE, SECOND_HALF);\r
 259                 next_cycle();\r
 260                 init_input_addresses_regular(m, ODD_STAGE);\r
 261                 m = init_mem_mapping(ODD_STAGE);\r
 262                 next_cycle();\r
 263                 do_half_regular_stage(m, ODD_STAGE, FIRST_HALF);\r
 264                 do_half_regular_stage(m, ODD_STAGE, SECOND_HALF);\r
 265         } while (loop_next(LC1));\r
 266 }\r