From dd71ae10b9ca72ca786353c59756d7a84a69125c Mon Sep 17 00:00:00 2001
From: Scott Shawcroft
Date: Tue, 6 Jun 2023 16:20:47 -0700
Subject: [PATCH] Slim down stack frames

This reduces the stack frame size of mp_builtin___import__ by
limiting the supported path length of files from 256 to 96. This
function can be called recursively for nested imports so it adds
up.

Also reduce mp_execute_bytecode (vm.c) from 206 to 124 per bytecode
call. This too is recursive and adds up. It is reduced by preventing
some inlining. It may decrease performance slightly when importing
and unpacking.

Adds two new scripts for debugging. One is used from gdb to print
frame sizes in a backtrace. The other prints what pcs use a
particular stack offset. This helps find infrequently used stack
space.

Fixes #8053.
---
 py/circuitpy_mpconfig.h  |  4 +--
 py/runtime.c             | 10 +++----
 tools/gdb-stack-size.py  | 64 ++++++++++++++++++++++++++++++++++++++++
 tools/stack-loc-to-pc.py | 28 ++++++++++++++++++
 4 files changed, 99 insertions(+), 7 deletions(-)
 create mode 100644 tools/gdb-stack-size.py
 create mode 100644 tools/stack-loc-to-pc.py

diff --git a/py/circuitpy_mpconfig.h b/py/circuitpy_mpconfig.h
index cc7886cb3765..e1909a064f54 100644
--- a/py/circuitpy_mpconfig.h
+++ b/py/circuitpy_mpconfig.h
@@ -59,8 +59,8 @@ extern void common_hal_mcu_enable_interrupts(void);
 //
 // default is 128; consider raising to reduce fragmentation.
 #define MICROPY_ALLOC_PARSE_CHUNK_INIT (16)
-// default is 512.
-#define MICROPY_ALLOC_PATH_MAX (256)
+// default is 512. Longest path in .py bundle as of June 6th, 2023 is 73 characters.
+#define MICROPY_ALLOC_PATH_MAX (96)
 #define MICROPY_CAN_OVERRIDE_BUILTINS (1)
 #define MICROPY_COMP_CONST (1)
 #define MICROPY_COMP_DOUBLE_TUPLE_ASSIGN (1)
diff --git a/py/runtime.c b/py/runtime.c
index 804b955e0745..ebe1590484b7 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -201,7 +201,7 @@ mp_obj_t MICROPY_WRAP_MP_LOAD_GLOBAL(mp_load_global)(qstr qst) {
     return elem->value;
 }
 
-mp_obj_t mp_load_build_class(void) {
+mp_obj_t __attribute__((noinline)) mp_load_build_class(void) {
     DEBUG_OP_printf("load_build_class\n");
     #if MICROPY_CAN_OVERRIDE_BUILTINS
     if (MP_STATE_VM(mp_module_builtins_override_dict) != NULL) {
@@ -858,7 +858,7 @@ mp_obj_t mp_call_method_n_kw_var(bool have_self, size_t n_args_n_kw, const mp_ob
 }
 
 // unpacked items are stored in reverse order into the array pointed to by items
-void mp_unpack_sequence(mp_obj_t seq_in, size_t num, mp_obj_t *items) {
+void __attribute__((noinline)) mp_unpack_sequence(mp_obj_t seq_in, size_t num, mp_obj_t *items) {
     size_t seq_len;
     if (mp_obj_is_type(seq_in, &mp_type_tuple) || mp_obj_is_type(seq_in, &mp_type_list)) {
         mp_obj_t *seq_items;
@@ -905,7 +905,7 @@ void mp_unpack_sequence(mp_obj_t seq_in, size_t num, mp_obj_t *items) {
 }
 
 // unpacked items are stored in reverse order into the array pointed to by items
-void mp_unpack_ex(mp_obj_t seq_in, size_t num_in, mp_obj_t *items) {
+void __attribute__((noinline)) mp_unpack_ex(mp_obj_t seq_in, size_t num_in, mp_obj_t *items) {
     size_t num_left = num_in & 0xff;
     size_t num_right = (num_in >> 8) & 0xff;
     DEBUG_OP_printf("unpack ex " UINT_FMT " " UINT_FMT "\n", num_left, num_right);
@@ -1482,7 +1482,7 @@ mp_obj_t mp_import_name(qstr name, mp_obj_t fromlist, mp_obj_t level) {
     return mp_builtin___import__(5, args);
 }
 
-mp_obj_t mp_import_from(mp_obj_t module, qstr name) {
+mp_obj_t __attribute__((noinline)) mp_import_from(mp_obj_t module, qstr name) {
     DEBUG_printf("import from %p %s\n", module, qstr_str(name));
 
     mp_obj_t dest[2];
@@ -1528,7 +1528,7 @@ mp_obj_t mp_import_from(mp_obj_t module, qstr name) {
     #endif
 }
 
-void mp_import_all(mp_obj_t module) {
+void __attribute__((noinline)) mp_import_all(mp_obj_t module) {
     DEBUG_printf("import all %p\n", module);
 
     // TODO: Support __all__
diff --git a/tools/gdb-stack-size.py b/tools/gdb-stack-size.py
new file mode 100644
index 000000000000..4d3fc9fe08aa
--- /dev/null
+++ b/tools/gdb-stack-size.py
@@ -0,0 +1,64 @@
+"""Source this file into gdb `source ../../tools/gdb-stack-size.py` then run
+   `stack-size` to print a backtrace with each frame size next to it."""
+
+
+class StackSize(gdb.Command):
+    def __init__(self):
+        super(StackSize, self).__init__("stack-size", gdb.COMMAND_USER)
+
+    def invoke(self, arg, from_tty):
+        frame = gdb.newest_frame()
+        total_size = 0
+        while frame:
+            sp = frame.read_register("sp")
+            frame_up = frame.older()
+            if not frame_up:
+                break
+            f = frame.function()
+            l = frame.level()
+            if l < 10:
+                l = "#" + str(l) + " "
+            else:
+                l = "#" + str(l)
+            size = frame_up.read_register("sp") - sp
+            total_size += size
+            print(l, sp, frame.type(), f, " " * (40 - len(str(f))), size)
+            # print(dir(f))
+            # Tweak this `if` for more detail for a specific function.
+            if False and f.name == "mp_execute_bytecode":
+                b = frame.block()
+                prev_b = None
+                while not b.is_static:
+                    print(" block", hex(b.start), hex(b.end), b.function)
+                    for sym in b:
+                        if not sym.needs_frame:
+                            continue
+                        v = sym.value(frame)
+                        print(" ", sym.addr_class, v.address, sym.type.sizeof, sym, sym.type, v)
+                    prev_b = b
+                    b = b.superblock
+
+                    if b.function == f:
+                        break
+                b = prev_b
+                print("pc scan", hex(b.start), hex(b.end))
+                seen = set()
+                for pc in range(b.start, b.end, 2):
+                    b = gdb.block_for_pc(pc)
+                    r = (b.start, b.end)
+                    if r in seen:
+                        continue
+                    seen.add(r)
+                    print(" ", hex(pc), hex(b.start), hex(b.end), b.function)
+                    for sym in b:
+                        if not sym.needs_frame:
+                            continue
+                        # if sym.type.sizeof <= 4:
+                        #     continue
+                        v = sym.value(frame)
+                        print(" ", sym.addr_class, v.address, sym.type.sizeof, sym, sym.type, v)
+            frame = frame_up
+        print("total size:", total_size)
+
+
+StackSize()
diff --git a/tools/stack-loc-to-pc.py b/tools/stack-loc-to-pc.py
new file mode 100644
index 000000000000..a1ce788f2b65
--- /dev/null
+++ b/tools/stack-loc-to-pc.py
@@ -0,0 +1,28 @@
+"""Prints the pcs that access each stack location in a function. Useful for finding
+   infrequently used stack space.
+
+   Pipe in disassembly like so:
+
+   arm-none-eabi-objdump --disassemble=mp_execute_bytecode build-metro_m0_express/firmware.elf | python ../../tools/stack-loc-to-pc.py
+   """
+
+import sys
+import re
+
+offset = re.compile(r"sp, #(\d+)")
+
+offsets = {}
+for line in sys.stdin:
+    # Skip lines that mention sp without an immediate offset (e.g. `mov r7, sp`).
+    m = offset.search(line)
+    if m is None:
+        continue
+    o = int(m.group(1))
+    pc = line.split(":")[0]
+    offsets.setdefault(o, []).append(pc.strip())
+
+print("Offset", "Size", "PCs", sep="\t")
+last_o = 0
+for o in sorted(offsets):
+    print(o, o - last_o, offsets[o], sep="\t")
+    last_o = o