1 #ifndef __MONTIUMCC__
\r
10 * Executes a single butterfly on ALU 0-3. The inputs are the words taken from
\r
11 * in, which will be read on various inputs of ALU 0-3. Outputs will be
\r
12 * returned and will be available on the output ports of ALU 0 and 2.
\r
/*
 * NOTE(review): fragmentary listing -- the embedded original line numbers
 * are non-contiguous, so the declaration of the local "out" struct, the
 * return statement and the closing brace are not visible here.
 */
14 INLINE struct bf_out butterfly(struct bf_in in) {

/* im(W) * im(b); west() presumably routes the product to the
 * neighbouring ALU for the subtraction below -- confirm against the
 * MontiumCC docs. */
18 aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));

20 /* re(W * b) = re(W) * re(b) - im(W) * im(b) */

21 aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);

24 /* re(out_a) = re(a) + re(W * b), on output 0 of ALU 0 */

25 out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));

26 /* re(out_b) = re(a) - re(W * b) */

27 out.b_re = p0o1(ssub_bf(rb1(in.a_re), Wxbr));

/* im(W) * re(b); same west() routing as Wixbi above */
31 aluexp Wixbr = west(fmul(rd1(in.W_im), rb1(in.b_re)));

32 /* im(W * b) = re(W) * im(b) + im(W) * re(b) */

33 aluexp Wxbi = sadd_acc(fmul(rc1(in.W_re), ra1(in.b_im)), Wixbr);

35 /* im(out_a) = im(a) + im(W * b), on output 0 of ALU 2 */

36 out.a_im = p2o0(sadd_bf(rb1(in.a_im), Wxbi));

37 /* im(out_b) = im(a) - im(W * b) */

38 out.b_im = p2o1(ssub_bf(rb1(in.a_im), Wxbi));
\r
44 * Writes the output of a butterfly given in res to the correct memory
\r
46 * @param second_half Are we in the second half of the stage?
\r
/*
 * Writes one butterfly result to the four output memories.
 * NOTE(review): fragmentary listing -- the if/else on second_half that
 * selects between the two write groups below, and the closing brace,
 * are not visible here.
 */
48 INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half) {

/* Advance every output stream by 2 before writing, so consecutive
 * calls hit every other word (offsets start at -2/-1, see
 * init_output_addresses_regular). */
49 add_offset(m.output_a_re, 2);

50 add_offset(m.output_a_im, 2);

51 add_offset(m.output_b_re, 2);

52 add_offset(m.output_b_im, 2);

/* Presumably the !second_half case: a -> memory a, b -> memory b */
55 write_mem(m.output_a_re, res.a_re);

56 write_mem(m.output_a_im, res.a_im);

57 write_mem(m.output_b_re, res.b_re);

58 write_mem(m.output_b_im, res.b_im);

/* Presumably the second_half case, per the comment below */
60 /* Write a results to memory b and v.v. */

61 write_mem(m.output_a_re, res.b_re);

62 write_mem(m.output_a_im, res.b_im);

63 write_mem(m.output_b_re, res.a_re);

64 write_mem(m.output_b_im, res.a_im);
\r
69 * Reads four inputs and two twiddle factors from memory.
\r
70 * Also increases memory offsets by 1 after reading.
\r
71 * @param stage_odd Is this an odd stage? If so, read from the left
\r
72 * memories, else read from the right memories
\r
73 * (not implemented yet).
\r
74 * @param cycle_odd Is this an odd cycle within the stage? If so,
\r
75 * read input a from memory b and v.v. If not,
\r
76 * simply read a from memory a and b from memory b.
\r
/*
 * Reads the four butterfly inputs and the twiddle factor for one cycle.
 * NOTE(review): fragmentary listing -- the declaration of the local
 * "in" struct, the if/else on cycle_odd that selects one of the two
 * read groups below, the "return in;" and the closing brace are not
 * visible here.
 */
78 INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, int stage) {

80 /* Swap memory a and b during the odd cycles */

/* Presumably the even-cycle branch: a from memory a, b from memory b */
82 in.a_re = read_mem(m.input_a_re);

83 in.a_im = read_mem(m.input_a_im);

84 in.b_re = read_mem(m.input_b_re);

85 in.b_im = read_mem(m.input_b_im);

/* Presumably the odd-cycle branch: a and b swapped, per the comment above */
87 in.b_re = read_mem(m.input_a_re);

88 in.b_im = read_mem(m.input_a_im);

89 in.a_re = read_mem(m.input_b_re);

90 in.a_im = read_mem(m.input_b_im);

/* Twiddle factor for this butterfly */
92 in.W_re = read_mem(m.twiddle_re);

93 in.W_im = read_mem(m.twiddle_im);

96 /* Read inputs sequentially */

97 add_offset(m.input_a_re, 1);

98 add_offset(m.input_a_im, 1);

99 add_offset(m.input_b_re, 1);

100 add_offset(m.input_b_im, 1);

102 /* TODO: Is this true? */

/* Twiddle stride shrinks by half per stage; the mask wraps the address
 * within the (N/2)-entry twiddle table (assumes PARAM_N_t is a power
 * of two -- TODO confirm). */
103 add_offset(m.twiddle_re, (PARAM_N_t>>stage));

104 add_offset(m.twiddle_im, (PARAM_N_t>>stage));

105 use_mask(m.twiddle_re, (PARAM_N_t/2)-1);

106 use_mask(m.twiddle_im, (PARAM_N_t/2)-1);
\r
112 * Initializes the addresses for reading the inputs and twiddle factors.
\r
113 * Should be called once at the start of each stage.
\r
/*
 * Resets all input and twiddle address generators to base 0, offset 0.
 * NOTE(review): the embedded numbering is contiguous here (116 -> 117),
 * yet there is no set_base(m.input_a_re, 0) to match the other five
 * set_base calls -- either the line was lost in extraction or this is a
 * genuine omission in the original; verify against the full source.
 * The closing brace is also not visible in this fragment.
 */
115 INLINE void init_input_addresses_regular(struct mems m) {

116 /* We simply start reading at address 0 incrementally */

117 set_base(m.input_a_im, 0);

118 set_base(m.input_b_re, 0);

119 set_base(m.input_b_im, 0);

120 set_base(m.twiddle_re, 0);

121 set_base(m.twiddle_im, 0);

123 set_offset(m.input_a_re, 0);

124 set_offset(m.input_a_im, 0);

125 set_offset(m.input_b_re, 0);

126 set_offset(m.input_b_im, 0);

127 set_offset(m.twiddle_re, 0);

128 set_offset(m.twiddle_im, 0);
\r
132 * Initializes the addresses for writing the outputs. This function must be

133 * called twice per stage, since halfway through the stage the addressing changes.
\r
/*
 * NOTE(review): fragmentary listing -- the opening of the comment block
 * directly below, the if/else on second_half that selects one of the
 * two set_offset groups, and the closing brace are not visible here.
 */
135 INLINE void init_output_addresses_regular(struct mems m, bool second_half) {

137 * For the second half of the stage, the starting addresses are

138 * reversed. write_output_regular above will also swap the output

140 * TODO: Better comments :-)

143 set_base(m.output_a_re, 0);

144 set_base(m.output_a_im, 0);

145 set_base(m.output_b_re, 0);

146 set_base(m.output_b_im, 0);

148 /* We subtract two from every address, since write_output_regular

149 * adds two to the offset before writing the first (and every other)

/* Presumably the !second_half group: a starts at word 1, b at word 0 */
152 set_offset(m.output_a_re, 1-2);

153 set_offset(m.output_a_im, 1-2);

154 set_offset(m.output_b_re, 0-2);

155 set_offset(m.output_b_im, 0-2);

/* Presumably the second_half group: starting addresses reversed,
 * matching the comment at the top of this function */
157 set_offset(m.output_a_re, 0-2);

158 set_offset(m.output_a_im, 0-2);

159 set_offset(m.output_b_re, 1-2);

160 set_offset(m.output_b_im, 1-2);
\r
/*
 * Runs half of one FFT stage: N_t/4 butterfly cycles, software-pipelined
 * so that each iteration writes the previous cycle's outputs before
 * reading the next inputs.
 * NOTE(review): fragmentary listing -- the "do {" that opens the loop
 * closed at "} while (loop_next(LC2));", whatever statement follows the
 * "Force the next cycle" comment at the bottom, and the closing brace
 * are not visible here.
 */
164 INLINE void do_half_regular_stage(struct mems m, int stage, bool second_half){

166 * We are doing two cycles in each iteration, so we can alternate the

167 * cycle_odd argument (which only works with constants, I don't expect

168 * the optimizer to do this loop unrolling for us). Since we need to

169 * write outputs before reading, but don't have any outputs to write

170 * in the first cycle, we must put the first cycle outside of the

171 * loop. Since the loop does two cycles at a time, this means there

172 * must be two cycles outside of the loop, so we put one at the end as

173 * well. Additionally, we also need to write the outputs of the last

174 * cycle in an extra cycle at the end. We probably can't combine this

175 * last cycle with the first cycle of the next stage, because they

176 * need the same memories (input becomes output and v.v.).

179 /* Initialize output addresses, this must be done twice per stage */

180 init_output_addresses_regular(m, second_half);

182 /* First cycle (no previous output to write) */

183 struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage);

184 struct bf_out out = butterfly(in);

186 /* Now, do half a single stage. That means N_t / 4 cycles. Since we do 2

187 * cycles on every iteration, plus one before and after the loop,

188 * we will loop N_t / 8 - 1 times. We add an extra - 1 because this is a do while loop... */

189 init_loop(LC2, (PARAM_N_t / 8) - 1 - 1);

191 /* Write outputs of previous cycle */

192 write_output_regular(m, out, second_half);

/* Odd cycle of this iteration */
195 in = read_input_regular(m, ODD_CYCLE, stage);

196 out = butterfly(in);

199 /* Write outputs of previous cycle */

200 write_output_regular(m, out, second_half);

/* Even cycle of the next iteration */
203 in = read_input_regular(m, EVEN_CYCLE, stage);

204 out = butterfly(in);

205 } while (loop_next(LC2));

207 /* Write outputs of previous cycle */

208 write_output_regular(m, out, second_half);

/* Final (odd) cycle outside the loop */
211 in = read_input_regular(m, ODD_CYCLE, stage);

212 out = butterfly(in);

215 /* Write outputs of last cycle */

216 write_output_regular(m, out, second_half);

218 /* Force the next cycle, because the next stage must read from

219 * the memory we just wrote to */
\r
224 * Assign the input and output memories, based on the current stage. Also
\r
225 * assigns the twiddle memories, but those are fixed.
\r
/*
 * Maps the physical memories (PxM0/PxM1) to the logical input/output
 * roles for the given stage, ping-ponging between the two memory banks
 * each stage. The twiddle memories are the same for every stage.
 * NOTE(review): fragmentary listing -- the declaration of the local
 * "res" struct, the "} else {" between the two alloc groups, the
 * "return res;" and the closing brace are not visible here.
 */
227 INLINE struct mems init_mem_mapping(int stage){

229 /* Use left memories for input on odd (ie, first)

230 * stages and right memories on even stages. */

231 if ((stage % 2) == 0) {

232 res.input_a_re = alloc_mem(P0M1);

233 res.input_a_im = alloc_mem(P1M1);

234 res.input_b_re = alloc_mem(P2M1);

235 res.input_b_im = alloc_mem(P3M1);

236 res.output_a_re = alloc_mem(P0M0);

237 res.output_a_im = alloc_mem(P1M0);

238 res.output_b_re = alloc_mem(P2M0);

239 res.output_b_im = alloc_mem(P3M0);

/* Presumably the else branch: banks swapped relative to the even case */
241 res.input_a_re = alloc_mem(P0M0);

242 res.input_a_im = alloc_mem(P1M0);

243 res.input_b_re = alloc_mem(P2M0);

244 res.input_b_im = alloc_mem(P3M0);

245 res.output_a_re = alloc_mem(P0M1);

246 res.output_a_im = alloc_mem(P1M1);

247 res.output_b_re = alloc_mem(P2M1);

248 res.output_b_im = alloc_mem(P3M1);

/* Twiddle factors always live in P4's two memories */
251 res.twiddle_re = alloc_mem(P4M0);

252 res.twiddle_im = alloc_mem(P4M1);
\r
/*
 * Runs one complete FFT stage: map memories, reset input addresses,
 * then process both halves of the stage.
 * NOTE(review): fragmentary listing -- the opening and closing braces
 * of this function are not visible here.
 */
257 INLINE void do_regular_stage(int stage)

259 struct mems m = init_mem_mapping(stage);

260 init_input_addresses_regular(m);

261 /* do_half_regular_stage will init output addresses */

263 do_half_regular_stage(m, stage, FIRST_HALF);

264 do_half_regular_stage(m, stage, SECOND_HALF);
\r
267 do { freeze(); } while (gpi(0) == 0);
\r
269 do_regular_stage(1);
\r
270 do_regular_stage(2);
\r
271 do_regular_stage(3);
\r
272 do_regular_stage(4);
\r