1 #ifndef __MONTIUMCC__
\r
/* NOTE(review): this listing is corrupted — each line carries a stray
 * original-line-number prefix, and several original lines are missing here:
 * the comment opener, the declaration of the local `out` (a struct bf_out),
 * and the trailing `return out;` with its closing brace. Annotations below
 * describe only what the visible code shows; confirm against the original
 * file before relying on them. */
10 * Executes a single butterfly on ALU 0-3. The inputs are the words taken from
11 * in, which will be read on various inputs of ALU 0-3. Outputs will be
12 * returned and will be available on the output ports of ALU 0 and 2.
14 INLINE struct bf_out butterfly(struct bf_in in) {
/* im(W) * im(b); `west` presumably routes the product to the neighbouring
 * ALU so it can be combined in the accumulate below — TODO confirm against
 * the Montium ALU documentation. */
18 aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));
20 /* re(W * b) = re(W) * re(b) - im(W) * im(b) */
21 aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);
/* Real parts of the two butterfly outputs: a + W*b and a - W*b,
 * placed on output ports p0o0 / p0o1 (ALU 0). */
24 /* re(out_a) = re(a) + re(W * b) */
25 out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));
26 /* re(out_b) = re(a) - re(W * b) */
27 out.b_re = p0o1(ssub_bf(rb1(in.a_re), Wxbr));
/* im(W) * re(b), again routed west for the accumulate below. */
31 aluexp Wixbr = west(fmul(rd1(in.W_im), rb1(in.b_re)));
32 /* im(W * b) = re(W) * im(b) + im(W) * re(b) */
33 aluexp Wxbi = sadd_acc(fmul(rc1(in.W_re), ra1(in.b_im)), Wixbr);
/* Imaginary parts of the two outputs, on ports p2o0 / p2o1 (ALU 2). */
35 /* im(out_a) = im(a) + im(W * b) */
36 out.a_im = p2o0(sadd_bf(rb1(in.a_im), Wxbi));
37 /* im(out_b) = im(a) - im(W * b) */
38 out.b_im = p2o1(ssub_bf(rb1(in.a_im), Wxbi));
/* NOTE(review): corrupted listing — comment opener and the branch lines are
 * missing. The two write groups below are presumably the two arms of an
 * `if (...second_half...) ... else ...`: during one half of the stage the
 * a/b results go to their own memories, during the other half they are
 * swapped (see the "v.v." comment). As shown, both groups would execute
 * unconditionally, which cannot be the intent — confirm against the
 * original file. */
44 * Writes the output of a butterfly given in res to the correct memory
46 * @param second_half Are we in the second half of the stage?
48 INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half) {
/* Advance all four output streams by 2 before writing; outputs are
 * interleaved pairwise, so consecutive butterflies write two apart. */
49 add_offset(m.output_a_re, 2);
50 add_offset(m.output_a_im, 2);
51 add_offset(m.output_b_re, 2);
52 add_offset(m.output_b_im, 2);
/* First (presumed) branch: write a-results to the a memories, b to b. */
55 write_mem(m.output_a_re, res.a_re);
56 write_mem(m.output_a_im, res.a_im);
57 write_mem(m.output_b_re, res.b_re);
58 write_mem(m.output_b_im, res.b_im);
/* Second (presumed) branch: swapped destinations. */
60 /* Write a results to memory b and v.v. */
61 write_mem(m.output_a_re, res.b_re);
62 write_mem(m.output_a_im, res.b_im);
63 write_mem(m.output_b_re, res.a_re);
64 write_mem(m.output_b_im, res.a_im);
/* NOTE(review): corrupted listing — missing here are the comment opener, the
 * declaration of the local `in` (a struct bf_in), the
 * `if (...cycle_odd...) ... else ...` branch lines around the two read
 * groups, and the trailing `return in;` with its closing brace. Confirm
 * against the original file. */
69 * Reads four inputs and two twiddle factors from memory.
70 * Also increases memory offsets by 1 after reading.
71 * @param stage_odd Is this an odd stage? If so, read from the left
72 * memories, else read from the right memories
73 * (not implemented yet).
74 * @param cycle_odd Is this an odd cycle within the stage? If so,
75 * read input a from memory b and v.v. If not,
76 * simply read a from memory a and b from memory b.
78 INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage_odd) {
80 /* Swap memory a and b during the odd cycles */
/* Even-cycle (presumed) branch: straight mapping a<-a, b<-b. */
82 in.a_re = read_mem(m.input_a_re);
83 in.a_im = read_mem(m.input_a_im);
84 in.b_re = read_mem(m.input_b_re);
85 in.b_im = read_mem(m.input_b_im);
/* Odd-cycle (presumed) branch: crossed mapping a<-b, b<-a. */
87 in.b_re = read_mem(m.input_a_re);
88 in.b_im = read_mem(m.input_a_im);
89 in.a_re = read_mem(m.input_b_re);
90 in.a_im = read_mem(m.input_b_im);
/* Twiddle factor W for this butterfly. */
92 in.W_re = read_mem(m.twiddle_re);
93 in.W_im = read_mem(m.twiddle_im);
96 /* Read inputs sequentially */
97 add_offset(m.input_a_re, 1);
98 add_offset(m.input_a_im, 1);
99 add_offset(m.input_b_re, 1);
100 add_offset(m.input_b_im, 1);
/* Twiddle offsets are left unchanged for now — every read returns the
 * same W until the TODO below is resolved. */
101 /* TODO: Update twiddle offsets */
/* NOTE(review): corrupted listing — the comment opener, the line
 * `set_base(m.input_a_re, 0);` (every other stream gets both set_base and
 * set_offset; a_re only gets set_offset below), and the closing brace are
 * missing. Also, this header comment and the one on
 * init_output_addresses_regular appear to have been swapped in the original
 * (this function takes no second_half parameter and touches only input and
 * twiddle memories); the text below is corrected accordingly. */
106 * Initializes the addresses for reading the inputs.
107 * @param stage_odd True if this is an odd stage.
108 * (review: the @param second_half documented here does not exist on this
 * function — it belongs to the output-address init.)
110 INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {
111 /* We simply start reading at address 0 incrementally */
112 set_base(m.input_a_im, 0);
113 set_base(m.input_b_re, 0);
114 set_base(m.input_b_im, 0);
115 set_base(m.twiddle_re, 0);
116 set_base(m.twiddle_im, 0);
/* Offsets start at 0; read_input_regular advances them by 1 per read. */
118 set_offset(m.input_a_re, 0);
119 set_offset(m.input_a_im, 0);
120 set_offset(m.input_b_re, 0);
121 set_offset(m.input_b_im, 0);
122 set_offset(m.twiddle_re, 0);
123 set_offset(m.twiddle_im, 0);
/* NOTE(review): corrupted listing — the comment opener and the
 * `if (...second_half...) ... else ...` branch lines around the two
 * set_offset groups are missing, as is the closing brace. As shown, the
 * second group would simply overwrite the first. Also, this header comment
 * and the one on init_input_addresses_regular appear swapped in the
 * original; the text below is corrected accordingly. */
127 * Initializes the addresses for writing the outputs. This function must be
128 * called twice per stage, since halfway the stage the addressing changes.
130 INLINE void init_output_addresses_regular(struct mems m, bool stage_odd, bool second_half) {
132 * For the second half of the stage, the starting addresses are
133 * reversed. write_output_regular above will also swap the output
135 * TODO: Better comments :-)
138 set_base(m.output_a_re, 0);
139 set_base(m.output_a_im, 0);
140 set_base(m.output_b_re, 0);
141 set_base(m.output_b_im, 0);
143 /* We subtract two from every address, since write_output_regular
144 * adds two to the offset before writing the first (and every other)
/* First (presumed) branch: a-streams start at 1, b-streams at 0. */
147 set_offset(m.output_a_re, 1-2);
148 set_offset(m.output_a_im, 1-2);
149 set_offset(m.output_b_re, 0-2);
150 set_offset(m.output_b_im, 0-2);
/* Second (presumed) branch: reversed — a-streams at 0, b-streams at 1. */
152 set_offset(m.output_a_re, 0-2);
153 set_offset(m.output_a_im, 0-2);
154 set_offset(m.output_b_re, 1-2);
155 set_offset(m.output_b_im, 1-2);
/* NOTE(review): corrupted listing — the `do {` that opens the LC2 loop
 * (between init_loop and the first in-loop write) and the function's closing
 * lines (presumably a next_cycle()-style call after the final comment, plus
 * the closing brace) are missing. */
159 INLINE void do_half_regular_stage(struct mems m, bool stage_odd, bool second_half){
161 * We are doing two cycles in each iteration, so we can alternate the
162 * cycle_odd argument (which only works with constants, I don't expect
163 * the optimizer to do this loop unrolling for us). Since we need to
164 * write outputs before reading, but don't have any outputs to write
165 * in the first cycle, we must put the first cycle outside of the
166 * loop. Since the loop does two cycles at a time, this means there
167 * must be two cycles outside of the loop, so we put one at the end as
168 * well. Additionally, we also need to write the outputs of the last
169 * cycle in an extra cycle at the end. We probably can't combine this
170 * last cycle with the first cycle of the next stage, because they
171 * need the same memories (input becomes output and v.v.).
174 /* Initialize output addresses, this must be done twice per stage */
175 init_output_addresses_regular(m, stage_odd, second_half);
177 /* First cycle (no previous output to write) */
178 struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage_odd);
179 struct bf_out out = butterfly(in);
181 /* Now, do half a single stage. That means N_t / 4 cycles. Since we do 2
182 * cycles on every iteration, plus one before and after the loop,
183 * we will loop N_t / 8 - 1 times. */
184 init_loop(LC2, (PARAM_N_t / 8) - 1);
186 /* Write outputs of previous cycle */
187 write_output_regular(m, out, second_half);
/* NOTE(review): the first read above passes stage_odd as the third
 * argument, but every later read passes second_half — yet
 * read_input_regular declares that parameter as stage_odd. One of the
 * two is presumably a bug; verify against the original. */
190 in = read_input_regular(m, ODD_CYCLE, second_half);
191 out = butterfly(in);
194 /* Write outputs of previous cycle */
195 write_output_regular(m, out, second_half);
198 in = read_input_regular(m, EVEN_CYCLE, second_half);
199 out = butterfly(in);
200 } while (loop_next(LC2));
202 /* Write outputs of previous cycle */
203 write_output_regular(m, out, second_half);
/* Final odd cycle outside the loop (see the block comment above). */
206 in = read_input_regular(m, ODD_CYCLE, second_half);
207 out = butterfly(in);
210 /* Write outputs of last cycle */
211 write_output_regular(m, out, second_half);
213 /* Force the next cycle, because the next stage must read from
214 * the memory we just wrote to */
/* NOTE(review): corrupted listing — missing here are the declaration of the
 * local `res` (a struct mems), the `if (...stage_odd...) ... else ...`
 * branch lines around the two alloc groups, and the trailing `return res;`
 * with its closing brace. The two groups mirror each other: inputs on the
 * M1 memories with outputs on M0 for one stage parity, and the reverse for
 * the other, so each stage reads what the previous stage wrote. */
218 INLINE struct mems init_mem_mapping(bool stage_odd){
/* First (presumed) branch: inputs from P0M1..P3M1, outputs to P0M0..P3M0. */
221 res.input_a_re = alloc_mem(P0M1);
222 res.input_a_im = alloc_mem(P1M1);
223 res.input_b_re = alloc_mem(P2M1);
224 res.input_b_im = alloc_mem(P3M1);
225 res.output_a_re = alloc_mem(P0M0);
226 res.output_a_im = alloc_mem(P1M0);
227 res.output_b_re = alloc_mem(P2M0);
228 res.output_b_im = alloc_mem(P3M0);
/* Second (presumed) branch: the reverse mapping. */
230 res.input_a_re = alloc_mem(P0M0);
231 res.input_a_im = alloc_mem(P1M0);
232 res.input_b_re = alloc_mem(P2M0);
233 res.input_b_im = alloc_mem(P3M0);
234 res.output_a_re = alloc_mem(P0M1);
235 res.output_a_im = alloc_mem(P1M1);
236 res.output_b_re = alloc_mem(P2M1);
237 res.output_b_im = alloc_mem(P3M1);
/* Twiddle factors live on processor part 4, unaffected by stage parity. */
240 res.twiddle_re = alloc_mem(P4M0);
241 res.twiddle_im = alloc_mem(P4M1);
/* NOTE(review): this fragment's enclosing function header (presumably
 * main()) is not visible in this chunk, nor are the declaration of `m`, the
 * `do {` that opens the LC1 loop, or the closing lines after the while.
 * Annotations are hedged accordingly. */
/* Presumably: stall until an external start signal arrives on GPI 0. */
246 do { freeze(); } while (gpi(0) == 0);
249 /* We need to do n_t regular stages. Since we do two stages each
250 * iteration, we'll do n_t / 2 iterations. */
251 init_loop(LC1, (PARAM_n_t / 2));
/* Even stage: map memories, then initialize input addresses. */
253 m = init_mem_mapping(EVEN_STAGE);
254 init_input_addresses_regular(m, EVEN_STAGE);
255 /* do_half_regular_stage will init output addresses */
257 do_half_regular_stage(m, EVEN_STAGE, FIRST_HALF);
258 do_half_regular_stage(m, EVEN_STAGE, SECOND_HALF);
/* NOTE(review): the odd stage inits input addresses on the OLD mapping
 * and only then remaps `m` — the reverse of the even-stage order above.
 * Looks like a swapped pair of lines; verify against the original. */
260 init_input_addresses_regular(m, ODD_STAGE);
261 m = init_mem_mapping(ODD_STAGE);
263 do_half_regular_stage(m, ODD_STAGE, FIRST_HALF);
264 do_half_regular_stage(m, ODD_STAGE, SECOND_HALF);
265 } while (loop_next(LC1));