1 #ifndef __MONTIUMCC__
\r
10 * Executes a single butterfly on ALU 0-3. The inputs are the words taken from
\r
11 * in, which will be read on various inputs of ALU 0-3. Outputs will be
\r
12 * returned and will we be available on the output ports of ALU 0 and 2.
\r
14 INLINE struct bf_out butterfly(struct bf_in in) {
\r
18 aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));
\r
19 /* re(W * b) = re(W) * re(b) - im(W) * im(b) */
\r
20 aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);
\r
22 /* re(out_a) = re(a) + re(W * b) */
\r
23 out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));
\r
24 /* re(out_b) = re(a) - re(W * b) */
\r
25 out.b_re = p0o1(ssub_bf(rb1(in.a_re), Wxbr));
\r
29 aluexp Wixbr = west(fmul(rd1(in.W_im), rb1(in.b_re)));
\r
30 /* im(W * b) = re(W) * im(b) + im(W) * re(b) */
\r
31 aluexp Wxbi = sadd_acc(fmul(rc1(in.W_re), ra1(in.b_im)), Wixbr);
\r
33 /* im(out_a) = im(a) + im(W * b) */
\r
34 out.a_im = p2o0(sadd_bf(rb1(in.a_im), Wxbi));
\r
35 /* im(out_b) = im(a) - im(W * b) */
\r
36 out.b_im = p2o1(ssub_bf(rb1(in.a_im), Wxbi));
\r
42 * Writes the output of a butterfly given in res to the correct memory
\r
44 * @param second_half Are we in the second half of the stage?
\r
46 INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half) {
\r
47 add_offset(m.output_a_re, 2);
\r
48 add_offset(m.output_a_im, 2);
\r
49 add_offset(m.output_b_re, 2);
\r
50 add_offset(m.output_b_im, 2);
\r
53 write_mem(m.output_a_re, res.a_re);
\r
54 write_mem(m.output_a_im, res.a_im);
\r
55 write_mem(m.output_b_re, res.b_re);
\r
56 write_mem(m.output_b_im, res.b_im);
\r
58 /* Write a results to memory b and v.v. */
\r
59 write_mem(m.output_a_re, res.b_re);
\r
60 write_mem(m.output_a_im, res.b_im);
\r
61 write_mem(m.output_b_re, res.a_re);
\r
62 write_mem(m.output_b_im, res.a_im);
\r
67 * Reads four inputs and two twiddle factors from memory.
\r
68 * Also increases memory offsets by 1 after reading.
\r
69 * @param stage_odd Is this an odd stage? If so, read from the left
\r
70 * memories, else read from the right memories
\r
71 * (not implemented yet).
\r
72 * @param cycle_odd Is this an odd cycle within the stage? If so,
\r
73 * read input a from memory b and v.v. If not,
\r
74 * simply read a from memory a and b from memory b.
\r
76 INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, bool stage_odd) {
\r
78 /* Swap memory a and b during the odd cycles */
\r
80 in.a_re = read_mem(m.input_a_re);
\r
81 in.a_im = read_mem(m.input_a_im);
\r
82 in.b_re = read_mem(m.input_b_re);
\r
83 in.b_im = read_mem(m.input_b_im);
\r
85 in.b_re = read_mem(m.input_a_re);
\r
86 in.b_im = read_mem(m.input_a_im);
\r
87 in.a_re = read_mem(m.input_b_re);
\r
88 in.a_im = read_mem(m.input_b_im);
\r
90 in.W_re = read_mem(m.twiddle_re);
\r
91 in.W_im = read_mem(m.twiddle_im);
\r
94 /* Read inputs sequentially */
\r
95 add_offset(m.input_a_re, 1);
\r
96 add_offset(m.input_a_im, 1);
\r
97 add_offset(m.input_b_re, 1);
\r
98 add_offset(m.input_b_im, 1);
\r
99 /* TODO: Update twiddle offsets */
\r
104 * Initializes the addresses for writing the outputs.
\r
105 * @param stage_odd True if this is an odd stage.
\r
106 * @param second_half True if we are initing halfway a stage.
\r
108 INLINE void init_input_addresses_regular(struct mems m, bool stage_odd) {
\r
109 /* We simply start reading at address 0 incrementally */
\r
110 set_base(m.input_a_im, 0);
\r
111 set_base(m.input_b_re, 0);
\r
112 set_base(m.input_b_im, 0);
\r
113 set_base(m.twiddle_re, 0);
\r
114 set_base(m.twiddle_im, 0);
\r
116 set_offset(m.input_a_re, 0);
\r
117 set_offset(m.input_a_im, 0);
\r
118 set_offset(m.input_b_re, 0);
\r
119 set_offset(m.input_b_im, 0);
\r
120 set_offset(m.twiddle_re, 0);
\r
121 set_offset(m.twiddle_im, 0);
\r
125 * Initializes the addresses for reading the inputs. This function must be
\r
126 * called twice per stage, since halfway the stage the addressing changes.
\r
128 INLINE void init_output_addresses_regular(struct mems m, bool stage_odd, bool second_half) {
\r
130 * For the second half of the stage, the starting addresses are
\r
131 * reversed. write_output_regular above will also swap the output
\r
133 * TODO: Better comments :-)
\r
136 set_base(m.output_a_re, 0);
\r
137 set_base(m.output_a_im, 0);
\r
138 set_base(m.output_b_re, 0);
\r
139 set_base(m.output_b_im, 0);
\r
141 /* We subtract two from every address, since write_output_regular
\r
142 * adds two to the offset before writing the first (and every other)
\r
145 set_offset(m.output_a_re, 1-2);
\r
146 set_offset(m.output_a_im, 1-2);
\r
147 set_offset(m.output_b_re, 0-2);
\r
148 set_offset(m.output_b_im, 0-2);
\r
150 set_offset(m.output_a_re, 1-2);
\r
151 set_offset(m.output_a_im, 1-2);
\r
152 set_offset(m.output_b_re, 0-2);
\r
153 set_offset(m.output_b_im, 0-2);
\r
157 INLINE void do_half_regular_stage(struct mems m, bool stage_odd, bool second_half){
\r
159 * We are doing two cycles in each iteration, so we can alternate the
\r
160 * cycle_odd argument (which only works with constants, I don't expect
\r
161 * the optimizer to do this loop unrolling for us). Since we need to
\r
162 * write outputs before reading, but don't have any outputs to write
\r
163 * in the first cycle, we must put the first cycle outside of the
\r
164 * loop. Since the loop does two cycles at a time, this means there
\r
165 * must be two cycles outside of the loop, so we put one at the end as
\r
166 * well. Additionally, we also need to write the outputs of the last
\r
167 * cycle in an extra cycle at the end. We probably can't combine this
\r
168 * last cycle with the first cycle of the next stage, because they
\r
169 * need the same memories (input becomes output and v.v.).
\r
172 /* Initialize output addresses, this must be done twice per stage */
\r
173 init_output_addresses_regular(m, stage_odd, second_half);
\r
175 /* First cycle (no previous output to write) */
\r
176 struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage_odd);
\r
177 struct bf_out out = butterfly(in);
\r
179 /* Now, do a single stage. That means N_t / 2 cycles. Since we do 2
\r
180 * cycles on every iteration, plus one before and after the loop,
\r
181 * we will loop N_t / 4 - 1 times. */
\r
182 init_loop(LC2, (PARAM_N_t / 4) - 1);
\r
184 /* Write outputs of previous cycle */
\r
185 write_output_regular(m, out, second_half);
\r
188 in = read_input_regular(m, ODD_CYCLE, second_half);
\r
189 out = butterfly(in);
\r
192 /* Write outputs of previous cycle */
\r
193 write_output_regular(m, out, second_half);
\r
196 in = read_input_regular(m, EVEN_CYCLE, second_half);
\r
197 out = butterfly(in);
\r
198 } while (loop_next(LC2));
\r
200 /* Write outputs of previous cycle */
\r
201 write_output_regular(m, out, second_half);
\r
204 in = read_input_regular(m, ODD_CYCLE, second_half);
\r
205 out = butterfly(in);
\r
208 /* Write outputs of last cycle */
\r
209 write_output_regular(m, out, second_half);
\r
211 /* Force the next cycle, because the next stage must read from
\r
212 * the memory we just wrote to */
\r
216 INLINE struct mems init_mem_mapping(bool stage_odd){
\r
219 res.input_a_re = alloc_mem(P0M1);
\r
220 res.input_a_im = alloc_mem(P1M1);
\r
221 res.input_b_re = alloc_mem(P2M1);
\r
222 res.input_b_im = alloc_mem(P3M1);
\r
223 res.output_a_re = alloc_mem(P0M0);
\r
224 res.output_a_im = alloc_mem(P1M0);
\r
225 res.output_b_re = alloc_mem(P2M0);
\r
226 res.output_b_im = alloc_mem(P3M0);
\r
228 res.input_a_re = alloc_mem(P0M0);
\r
229 res.input_a_im = alloc_mem(P1M0);
\r
230 res.input_b_re = alloc_mem(P2M0);
\r
231 res.input_b_im = alloc_mem(P3M0);
\r
232 res.output_a_re = alloc_mem(P0M1);
\r
233 res.output_a_im = alloc_mem(P1M1);
\r
234 res.output_b_re = alloc_mem(P2M1);
\r
235 res.output_b_im = alloc_mem(P3M1);
\r
238 res.twiddle_re = alloc_mem(P4M0);
\r
239 res.twiddle_im = alloc_mem(P4M1);
\r
244 do { freeze(); } while (gpi(0) == 0);
\r
247 m = init_mem_mapping(EVEN_STAGE);
\r
248 init_input_addresses_regular(m, EVEN_STAGE);
\r
249 /* do_half_regular_stage will init output addresses */
\r
251 do_half_regular_stage(m, EVEN_STAGE, FIRST_HALF);
\r
252 do_half_regular_stage(m, EVEN_STAGE, SECOND_HALF);
\r
254 init_input_addresses_regular(m, ODD_STAGE);
\r
255 m = init_mem_mapping(ODD_STAGE);
\r
257 do_half_regular_stage(m, ODD_STAGE, FIRST_HALF);
\r
258 do_half_regular_stage(m, ODD_STAGE, SECOND_HALF);
\r