diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c
index 1b113e3e828..1f171f83dd0 100644
--- a/src/platforms/common/swdptap.c
+++ b/src/platforms/common/swdptap.c
@@ -46,6 +46,18 @@ static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) __attribut
 static void swdptap_seq_out(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3)));
 static void swdptap_seq_out_parity(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3)));
 
+/*
+ * Overall strategy for timing consistency:
+ *
+ * - Each primitive ends with a falling clock edge
+ * - Output is driven after the falling clock edge
+ * - Input is read immediately before the rising clock edge
+ * - Each primitive assumes it was immediately preceded by a falling clock edge
+ *
+ * This increases the chances of meeting setup and hold times when the target
+ * connection is lower bandwidth (with adequately slower clocks configured).
+ */
+
 void swdptap_init(void)
 {
 	swd_proc.seq_in = swdptap_seq_in;
@@ -68,9 +80,7 @@ static void swdptap_turnaround(const swdio_status_t dir)
 
 	if (dir == SWDIO_STATUS_FLOAT) {
 		SWDIO_MODE_FLOAT();
-	} else
-		gpio_clear(SWCLK_PORT, SWCLK_PIN);
-
+	}
 	for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter)
 		continue;
 	gpio_set(SWCLK_PORT, SWCLK_PIN);
@@ -78,8 +88,8 @@ static void swdptap_turnaround(const swdio_status_t dir)
 	for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter)
 		continue;
+	gpio_clear(SWCLK_PORT, SWCLK_PIN);
 	if (dir == SWDIO_STATUS_DRIVE) {
-		gpio_clear(SWCLK_PORT, SWCLK_PIN);
 		SWDIO_MODE_DRIVE();
 	}
 }
@@ -89,16 +99,29 @@ static uint32_t swdptap_seq_in_clk_delay(size_t clock_cycles) __attribute__((opt
 static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles)
 {
 	uint32_t value = 0;
-	for (size_t cycle = 0; cycle < clock_cycles; ++cycle) {
-		gpio_clear(SWCLK_PORT, SWCLK_PIN);
-		value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U;
+	if (!clock_cycles)
+		return 0;
+	/*
+	 * Count down instead of up, because with an up-count, some ARM-GCC
+	 * versions use an explicit CMP, missing the optimization of converting
+	 * to a faster down-count that uses SUBS followed by BCS/BCC.
+	 */
+	for (size_t cycle = clock_cycles; cycle--;) {
 		for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
 			continue;
+		const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN);
 		gpio_set(SWCLK_PORT, SWCLK_PIN);
 		for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
 			continue;
+		value >>= 1U;
+		value |= (uint32_t)bit << 31U;
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
+		gpio_clear(SWCLK_PORT, SWCLK_PIN);
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
 	}
-	gpio_clear(SWCLK_PORT, SWCLK_PIN);
+	value >>= (32U - clock_cycles);
 	return value;
 }
 
@@ -106,14 +129,29 @@ static uint32_t swdptap_seq_in_no_delay(size_t clock_cycles) __attribute__((opti
 static uint32_t swdptap_seq_in_no_delay(const size_t clock_cycles)
 {
+	if (!clock_cycles)
+		return 0;
 	uint32_t value = 0;
-	for (size_t cycle = 0; cycle < clock_cycles; ++cycle) {
-		gpio_clear(SWCLK_PORT, SWCLK_PIN);
-		value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U;
+	/*
+	 * Count down instead of up, because with an up-count, some ARM-GCC
+	 * versions use an explicit CMP, missing the optimization of converting
+	 * to a faster down-count that uses SUBS followed by BCS/BCC.
+	 */
+	for (size_t cycle = clock_cycles; cycle--;) {
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
+		bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN);
 		gpio_set(SWCLK_PORT, SWCLK_PIN);
-		__asm__("nop");
+		__asm__("nop" ::: "memory");
+		value >>= 1U;
+		value |= (uint32_t)bit << 31U;
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
+		gpio_clear(SWCLK_PORT, SWCLK_PIN);
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
 	}
-	gpio_clear(SWCLK_PORT, SWCLK_PIN);
+	value >>= (32U - clock_cycles);
 	return value;
 }
 
@@ -132,16 +170,18 @@ static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles)
 	for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter)
 		continue;
 
-	const bool parity = calculate_odd_parity(result);
 	const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN);
 	gpio_set(SWCLK_PORT, SWCLK_PIN);
 	for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter)
 		continue;
 
-	*ret = result;
+	gpio_clear(SWCLK_PORT, SWCLK_PIN);
 	/* Terminate the read cycle now */
 	swdptap_turnaround(SWDIO_STATUS_DRIVE);
+
+	const bool parity = calculate_odd_parity(result);
+	*ret = result;
 	return parity == bit;
 }
 
@@ -149,26 +189,57 @@ static void swdptap_seq_out_clk_delay(uint32_t tms_states, size_t clock_cycles)
 static void swdptap_seq_out_clk_delay(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3)));
 static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t clock_cycles)
 {
-	for (size_t cycle = 0; cycle < clock_cycles; ++cycle) {
-		gpio_clear(SWCLK_PORT, SWCLK_PIN);
-		gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1U << cycle));
+	uint32_t value = tms_states;
+	bool bit = value & 1U;
+	if (!clock_cycles)
+		return;
+	/*
+	 * Count down instead of up, because with an up-count, some ARM-GCC
+	 * versions use an explicit CMP, missing the optimization of converting
+	 * to a faster down-count that uses SUBS followed by BCS/BCC.
+	 */
+	for (size_t cycle = clock_cycles; cycle--;) {
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
+		gpio_set_val(SWDIO_PORT, SWDIO_PIN, bit);
 		for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
 			continue;
 		gpio_set(SWCLK_PORT, SWCLK_PIN);
 		for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
 			continue;
+		__asm__("nop" ::: "memory");
+		value >>= 1U;
+		bit = value & 1U;
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
+		gpio_clear(SWCLK_PORT, SWCLK_PIN);
 	}
-	gpio_clear(SWCLK_PORT, SWCLK_PIN);
 }
 
 static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3)));
 static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clock_cycles)
 {
-	for (size_t cycle = 0; cycle < clock_cycles; ++cycle) {
+	uint32_t value = tms_states;
+	bool bit = value & 1U;
+	if (!clock_cycles)
+		return;
+	/*
+	 * Count down instead of up, because with an up-count, some ARM-GCC
+	 * versions use an explicit CMP, missing the optimization of converting
+	 * to a faster down-count that uses SUBS followed by BCS/BCC.
+	 */
+	for (size_t cycle = clock_cycles; cycle--;) {
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
 		gpio_clear(SWCLK_PORT, SWCLK_PIN);
-		gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1U << cycle));
+		gpio_set_val(SWDIO_PORT, SWDIO_PIN, bit);
 		gpio_set(SWCLK_PORT, SWCLK_PIN);
+		__asm__("nop" ::: "memory");
+		value >>= 1U;
+		bit = value & 1U;
+		/* Reordering barrier */
+		__asm__("" ::: "memory");
 	}
 	gpio_clear(SWCLK_PORT, SWCLK_PIN);
 }
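
For reference, the LSB-first accumulation used by the reworked swdptap_seq_in_* loops can be sketched in isolation as below. This is a minimal illustration only, not part of the patch: sample_bit() is a hypothetical stand-in for gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN), and the real routines interleave the clock edges, delay loops and reordering barriers shown above.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Shift each new bit in at bit 31, then right-justify the result once at the end. */
static uint32_t accumulate_lsb_first(bool (*const sample_bit)(void), const size_t clock_cycles)
{
	uint32_t value = 0;
	if (!clock_cycles || clock_cycles > 32U)
		return 0U;
	/* Down-count so the compiler can use SUBS + BCS/BCC rather than an explicit CMP */
	for (size_t cycle = clock_cycles; cycle--;) {
		value >>= 1U;
		value |= (uint32_t)sample_bit() << 31U;
	}
	/* Move the first-sampled bit down to bit 0 */
	return value >> (32U - clock_cycles);
}

Sampling the 8-bit sequence 1, 0, 1, 1, 0, 0, 0, 0 this way yields 0x0d, matching what the earlier "1U << cycle" accumulation produced, but without a variable-width shift in every iteration.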