-INLINE void do_half_regular_stage(struct mems m, bool stage_odd, bool second_half){\r
- struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage_odd);\r
+INLINE void do_half_regular_stage(struct mems m, int stage, bool second_half){\r
+ /*\r
+ * We are doing two cycles in each iteration, so we can alternate the\r
+ * cycle_odd argument (which only works with constants; I don't expect\r
+ * the optimizer to do this loop unrolling for us). Since we need to\r
+ * write outputs before reading, but don't have any outputs to write\r
+ * in the first cycle, we must put the first cycle outside of the\r
+ * loop. Since the loop does two cycles at a time, this means there\r
+ * must be two cycles outside of the loop, so we put one at the end as\r
+ * well. Additionally, we also need to write the outputs of the last\r
+ * cycle in an extra cycle at the end. We probably can't combine this\r
+ * last cycle with the first cycle of the next stage, because they\r
+ * need the same memories (input becomes output and vice versa).\r
+ */\r
+\r
+ /* Initialize output addresses; this must be done twice per stage */\r
+ init_output_addresses_regular(m, second_half);\r
+\r
+ /* First cycle (no previous output to write) */\r
+ struct bf_in in = read_input_regular(m, EVEN_CYCLE, stage);\r