1 #ifndef __MONTIUMCC__
\r
10 * Executes a single butterfly on ALU 0-3. The inputs are the words taken from
\r
11 * in, which will be read on various inputs of ALU 0-3. Outputs will be
\r
12 * returned and will we be available on the output ports of ALU 0 and 2.
\r
14 INLINE struct bf_out butterfly(struct bf_in in) {
\r
18 aluexp Wixbi = west(fmul(rd1(in.W_im), rb1(in.b_im)));
\r
20 /* re(W * b) = re(W) * re(b) - im(W) * im(b) */
\r
21 aluexp Wxbr = ssub_acc(fmul(rc1(in.W_re), ra1(in.b_re)), Wixbi);
\r
24 /* re(out_a) = re(a) + re(W * b) */
\r
25 out.a_re = p0o0(sadd_bf(rb1(in.a_re), Wxbr));
\r
26 /* re(out_b) = re(a) - re(W * b) */
\r
27 out.b_re = p0o1(ssub_bf(rb1(in.a_re), Wxbr));
\r
31 aluexp Wixbr = west(fmul(rd1(in.W_im), rb1(in.b_re)));
\r
32 /* im(W * b) = re(W) * im(b) + im(W) * re(b) */
\r
33 aluexp Wxbi = sadd_acc(fmul(rc1(in.W_re), ra1(in.b_im)), Wixbr);
\r
35 /* im(out_a) = im(a) + im(W * b) */
\r
36 out.a_im = p2o0(sadd_bf(rb1(in.a_im), Wxbi));
\r
37 /* im(out_b) = im(a) - im(W * b) */
\r
38 out.b_im = p2o1(ssub_bf(rb1(in.a_im), Wxbi));
\r
44 * Writes the output of a butterfly given in res to the correct memory
\r
46 * @param second_half Are we in the second half of the stage?
\r
48 INLINE void write_output_regular(struct mems m, struct bf_out res, bool second_half, enum out_strategy out_s) {
\r
49 if (out_s == REGULAR_OUT) {
\r
50 /* Skip a memory on each cycle during the regular stages */
\r
51 add_offset(m.output_a_re, 2);
\r
52 add_offset(m.output_a_im, 2);
\r
53 add_offset(m.output_b_re, 2);
\r
54 add_offset(m.output_b_im, 2);
\r
56 /* Simply write output linearly */
\r
57 add_offset(m.output_a_re, 1);
\r
58 add_offset(m.output_a_im, 1);
\r
59 add_offset(m.output_b_re, 1);
\r
60 add_offset(m.output_b_im, 1);
\r
62 if (out_s == BITREVERSED_OUT) {
\r
64 Use the memories (which are n_t - 1 bits wide) bitreversed.
\r
65 Since we are generating the samples in sequence (0, 1, 2, 3,
\r
66 ...) but are writing them to two different memories (0, 8,
\r
67 1, 9, ...) The last bit is already bitreversed, so in effect
\r
68 we have fully bitreversed the results. Note that this holds
\r
69 in the non-distributed case (Q = 1), but might also hold in
\r
70 the distributed case (if the tile numbers are bitreversed
\r
71 before concatenating memory).
\r
73 use_bitreverse(m.output_a_re, PARAM_n_t - 1);
\r
74 use_bitreverse(m.output_a_im, PARAM_n_t - 1);
\r
75 use_bitreverse(m.output_b_re, PARAM_n_t - 1);
\r
76 use_bitreverse(m.output_b_im, PARAM_n_t - 1);
\r
79 if (out_s == REGULAR_OUT && second_half) {
\r
80 /* When in the regular stages, reverse memory a and b during
\r
81 * the second half */
\r
82 write_mem(m.output_a_re, res.b_re);
\r
83 write_mem(m.output_a_im, res.b_im);
\r
84 write_mem(m.output_b_re, res.a_re);
\r
85 write_mem(m.output_b_im, res.a_im);
\r
87 /* Simply write a to mem a and b to mem b */
\r
88 write_mem(m.output_a_re, res.a_re);
\r
89 write_mem(m.output_a_im, res.a_im);
\r
90 write_mem(m.output_b_re, res.b_re);
\r
91 write_mem(m.output_b_im, res.b_im);
\r
96 * Reads four inputs and two twiddle factors from memory.
\r
97 * Also increases memory offsets by 1 after reading.
\r
98 * @param stage_odd Is this an odd stage? If so, read from the left
\r
99 * memories, else read from the right memories
\r
100 * (not implemented yet).
\r
101 * @param cycle_odd Is this an odd cycle within the stage? If so,
\r
102 * read input a from memory b and v.v. If not,
\r
103 * simply read a from memory a and b from memory b.
\r
105 INLINE struct bf_in read_input_regular(struct mems m, bool cycle_odd, int stage) {
\r
107 /* Swap memory a and b during the odd cycles */
\r
109 in.a_re = read_mem(m.input_a_re);
\r
110 in.a_im = read_mem(m.input_a_im);
\r
111 in.b_re = read_mem(m.input_b_re);
\r
112 in.b_im = read_mem(m.input_b_im);
\r
114 in.b_re = read_mem(m.input_a_re);
\r
115 in.b_im = read_mem(m.input_a_im);
\r
116 in.a_re = read_mem(m.input_b_re);
\r
117 in.a_im = read_mem(m.input_b_im);
\r
119 in.W_re = read_mem(m.twiddle_re);
\r
120 in.W_im = read_mem(m.twiddle_im);
\r
123 /* Read inputs sequentially */
\r
124 add_offset(m.input_a_re, 1);
\r
125 add_offset(m.input_a_im, 1);
\r
126 add_offset(m.input_b_re, 1);
\r
127 add_offset(m.input_b_im, 1);
\r
129 /* TODO: Is this true? */
\r
130 add_offset(m.twiddle_re, (PARAM_N_t>>stage));
\r
131 add_offset(m.twiddle_im, (PARAM_N_t>>stage));
\r
132 use_mask(m.twiddle_re, (PARAM_N_t/2)-1);
\r
133 use_mask(m.twiddle_im, (PARAM_N_t/2)-1);
\r
139 * Initializes the addresses for reading the inputs and twiddel factors.
\r
140 * Should be called once at the start of each stage.
\r
142 INLINE void init_input_addresses_regular(struct mems m) {
\r
143 /* We simply start reading at address 0 incrementally */
\r
144 set_base(m.input_a_im, 0);
\r
145 set_base(m.input_b_re, 0);
\r
146 set_base(m.input_b_im, 0);
\r
147 set_base(m.twiddle_re, 0);
\r
148 set_base(m.twiddle_im, 0);
\r
150 set_offset(m.input_a_re, 0);
\r
151 set_offset(m.input_a_im, 0);
\r
152 set_offset(m.input_b_re, 0);
\r
153 set_offset(m.input_b_im, 0);
\r
154 set_offset(m.twiddle_re, 0);
\r
155 set_offset(m.twiddle_im, 0);
\r
159 * Initializes the addresses for reading the inputs. This function must be
\r
160 * called twice per stage, since halfway the stage the addressing changes.
\r
162 INLINE void init_output_addresses_regular(struct mems m, bool second_half, enum out_strategy out_s) {
\r
163 /* Only reset the memory addresses for the second half for the regular
\r
165 if (out_s != REGULAR_OUT && second_half)
\r
168 * For the second half of the stage, the starting addresses are
\r
169 * reversed. write_output_regular above will also swap the output
\r
171 * TODO: Better comments :-)
\r
173 set_base(m.output_a_re, 0);
\r
174 set_base(m.output_a_im, 0);
\r
175 set_base(m.output_b_re, 0);
\r
176 set_base(m.output_b_im, 0);
\r
177 if (out_s == REGULAR_OUT) {
\r
178 /* We subtract two from every address, since write_output_regular
\r
179 * adds two to the offset before writing the first (and every other)
\r
182 set_offset(m.output_a_re, 1-2);
\r
183 set_offset(m.output_a_im, 1-2);
\r
184 set_offset(m.output_b_re, 0-2);
\r
185 set_offset(m.output_b_im, 0-2);
\r
187 set_offset(m.output_a_re, 0-2);
\r
188 set_offset(m.output_a_im, 0-2);
\r
189 set_offset(m.output_b_re, 1-2);
\r
190 set_offset(m.output_b_im, 1-2);
\r
193 /* Write sequentially, starting at 0 for both memories */
\r
194 set_offset(m.output_a_re, 0-1);
\r
195 set_offset(m.output_a_im, 0-1);
\r
196 set_offset(m.output_b_re, 0-1);
\r
197 set_offset(m.output_b_im, 0-1);
\r
201 INLINE void do_half_regular_stage(struct mems m, int stage, bool second_half, enum in_strategy in_s, enum out_strategy out_s){
\r
203 * We are doing two cycles in each iteration, so we can alternate the
\r
204 * cycle_odd argument (which only works with constants, I don't expect
\r
205 * the optimizer to do this loop unrolling for us). Since we need to
\r
206 * write outputs before reading, but don't have any outputs to write
\r
207 * in the first cycle, we must put the first cycle outside of the
\r
208 * loop. Since the loop does two cycles at a time, this means there
\r
209 * must be two cycles outside of the loop, so we put one at the end as
\r
210 * well. Additionally, we also need to write the outputs of the last
\r
211 * cycle in an extra cycle at the end. We probably can't combine this
\r
212 * last cycle with the first cycle of the next stage, because they
\r
213 * need the same memories (input becomes output and v.v.).
\r
216 /* Initialize output addresses, this must be done twice per stage */
\r
217 init_output_addresses_regular(m, second_half, out_s);
\r
219 /* First cycle (no previous output to write) */
\r
220 struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage);
\r
221 struct bf_out out = butterfly(in);
\r
223 /* Now, do half a single stage. That means N_t / 4 cycles. Since we do 2
\r
224 * cycles on every iteration, plus one before and after the loop,
\r
225 * we will loop N_t / 8 - 1 times. We add an extra - 1 because this is a do while loop... */
\r
226 init_loop(LC2, (PARAM_N_t / 8) - 1 - 1);
\r
228 /* Write outputs of previous cycle */
\r
229 write_output_regular(m, out, second_half, out_s);
\r
232 in = read_input_regular(m, ODD_CYCLE, stage);
\r
233 out = butterfly(in);
\r
236 /* Write outputs of previous cycle */
\r
237 write_output_regular(m, out, second_half, out_s);
\r
240 in = read_input_regular(m, EVEN_CYCLE, stage);
\r
241 out = butterfly(in);
\r
242 } while (loop_next(LC2));
\r
244 /* Write outputs of previous cycle */
\r
245 write_output_regular(m, out, second_half, out_s);
\r
248 in = read_input_regular(m, ODD_CYCLE, stage);
\r
249 out = butterfly(in);
\r
252 /* Write outputs of last cycle */
\r
253 write_output_regular(m, out, second_half, out_s);
\r
255 /* Force the next cycle, because the next stage must read from
\r
256 * the memory we just wrote to */
\r
261 * Assign the input and output memories, based on the current stage. Also
\r
262 * assigns the twiddle memories, but those are fixed.
\r
264 INLINE struct mems init_mem_mapping(int stage){
\r
266 /* Use left memories for input on odd (ie, first)
\r
267 * stages and right memories on even stages. */
\r
268 if ((stage % 2) == 0) {
\r
269 res.input_a_re = alloc_mem(P0M1);
\r
270 res.input_a_im = alloc_mem(P1M1);
\r
271 res.input_b_re = alloc_mem(P2M1);
\r
272 res.input_b_im = alloc_mem(P3M1);
\r
273 res.output_a_re = alloc_mem(P0M0);
\r
274 res.output_a_im = alloc_mem(P1M0);
\r
275 res.output_b_re = alloc_mem(P2M0);
\r
276 res.output_b_im = alloc_mem(P3M0);
\r
278 res.input_a_re = alloc_mem(P0M0);
\r
279 res.input_a_im = alloc_mem(P1M0);
\r
280 res.input_b_re = alloc_mem(P2M0);
\r
281 res.input_b_im = alloc_mem(P3M0);
\r
282 res.output_a_re = alloc_mem(P0M1);
\r
283 res.output_a_im = alloc_mem(P1M1);
\r
284 res.output_b_re = alloc_mem(P2M1);
\r
285 res.output_b_im = alloc_mem(P3M1);
\r
288 res.twiddle_re = alloc_mem(P4M0);
\r
289 res.twiddle_im = alloc_mem(P4M1);
\r
294 INLINE void do_regular_stage(int stage, enum in_strategy in_s, enum out_strategy out_s)
\r
296 struct mems m = init_mem_mapping(stage);
\r
297 init_input_addresses_regular(m);
\r
298 /* do_half_regular_stage will init output addresses */
\r
300 do_half_regular_stage(m, stage, FIRST_HALF, in_s, out_s);
\r
301 do_half_regular_stage(m, stage, SECOND_HALF, in_s, out_s);
\r
305 do { freeze(); } while (gpi(0) == 0);
\r
307 do_regular_stage(1, REGULAR_IN, REGULAR_OUT);
\r
308 do_regular_stage(2, REGULAR_IN, REGULAR_OUT);
\r
309 do_regular_stage(3, REGULAR_IN, REGULAR_OUT);
\r
310 do_regular_stage(4, REGULAR_IN, BITREVERSED_OUT);
\r