diff --git a/CHANGELOG.md b/CHANGELOG.md index d4f937bd..87d99fc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,13 @@ * Fix a hang due to incorrect jump-table boundaries inferred from irrelevant register correlations to the index register * Requires gtirb >=2.2.0 -* Improved code inference in ARM binaries: +* Improved code inference: - Do not miss code after literal pools. + - Switch decode mode if invalid instruction found in ARM. + - Fixed bug in pointers to string data blocks. + - Restrict padding blocks so they do not share instructions with code blocks. + - Start a new block if we transition from non-padding to padding. + - Change the type of several heuristics from "simple" to "proportional". - Additional heuristic: Simple string literals in literal pools - Additional heuristic: Function beginning pattern with push/adjust-sp as plausible instruction sequence * Fix bug that led to string data blocks potentially overlapping code blocks. diff --git a/examples/arm_asm_examples/ex_ldr/ex_ldr.s b/examples/arm_asm_examples/ex_ldr/ex_ldr.s index 2c08694c..547253a1 100644 --- a/examples/arm_asm_examples/ex_ldr/ex_ldr.s +++ b/examples/arm_asm_examples/ex_ldr/ex_ldr.s @@ -178,7 +178,7 @@ thumbfunc: cmp r0, #1 bhi .BHI19 -.INVALID18: +.INVALID18_THUMB: @invalid: ldrd r0, sp, [r1], #4 .short 0xe8f1 .short 0x0d01 @@ -187,7 +187,7 @@ thumbfunc: cmp r0, #1 bhi .BHI20 -.INVALID19: +.INVALID19_THUMB: @invalid: ldrd r0, pc, [r1], #4 .short 0xe8f1 .short 0x0f01 @@ -196,7 +196,7 @@ thumbfunc: cmp r0, #1 bhi .BHI21 -.INVALID20: +.INVALID20_THUMB: @invalid: ldrd sp, r0, [r1], #4 .short 0xe8f1 .short 0xd001 @@ -205,7 +205,7 @@ thumbfunc: cmp r0, #1 bhi .exit_thumb -.INVALID21: +.INVALID21_THUMB: @invalid: ldrd pc, r0, [r1], #4 .short 0xe8f1 .short 0xf001 diff --git a/src/datalog/arch/arm32_code_inference.dl b/src/datalog/arch/arm32_code_inference.dl index b1d9f184..0dc800d3 100644 --- a/src/datalog/arch/arm32_code_inference.dl +++ b/src/datalog/arch/arm32_code_inference.dl 
@@ -898,18 +898,6 @@ data_block_candidate(Block,Size):- End = Block2 - 3 ). -/** -EA: adr r0, RefAddr -... -RefAddr: .string "..." -*/ -block_heuristic(RefAddr,"data",Size,0,"possible string"), -data_block_candidate(RefAddr,Size):- - arch.pc_relative_addr(EA,_,RefAddr), - code_in_block_candidate(EA,_), - !composite_data_access(EA,_,_,_), - ascii_string(RefAddr,End), - Size = End - RefAddr. /** EA: ldr r0, LitPoolAddr diff --git a/src/datalog/arch/arm32_code_inference_weights.dl b/src/datalog/arch/arm32_code_inference_weights.dl index 599a972c..f7147f2b 100644 --- a/src/datalog/arch/arm32_code_inference_weights.dl +++ b/src/datalog/arch/arm32_code_inference_weights.dl @@ -28,7 +28,6 @@ default_heuristic_weight("arm: jump table: no start",PROPORTIONAL_WEIGHT,2). default_heuristic_weight("arm: jump table: no symbol",PROPORTIONAL_WEIGHT,15). default_heuristic_weight("arm: jump table",PROPORTIONAL_WEIGHT,5). -default_heuristic_weight("possible string",PROPORTIONAL_WEIGHT,1). default_heuristic_weight("possible string: string param for string library",PROPORTIONAL_WEIGHT,1). default_heuristic_weight("possible string: string pred exists",PROPORTIONAL_WEIGHT,1). default_heuristic_weight("possible string: string succ exists",PROPORTIONAL_WEIGHT,1). diff --git a/src/datalog/arch/arm64/interrupt_operations.dl b/src/datalog/arch/arm64/interrupt_operations.dl index b9e221c0..91af2cd0 100644 --- a/src/datalog/arch/arm64/interrupt_operations.dl +++ b/src/datalog/arch/arm64/interrupt_operations.dl @@ -28,3 +28,6 @@ interrupt_operation("INT"). interrupt_operation("INTO"). interrupt_operation("INT1"). interrupt_operation("INT3"). + +// Breakpoint operation +interrupt_operation("BRK"). 
diff --git a/src/datalog/code_inference.dl b/src/datalog/code_inference.dl index 535fa42d..c48afb1a 100644 --- a/src/datalog/code_inference.dl +++ b/src/datalog/code_inference.dl @@ -377,10 +377,23 @@ block_limit(Inst):- block_limit(Inst):- ( arch.simple_data_load(_,EA,Size); - composite_data_access(_,_,EA,Size) + composite_data_access(_,_,EA,Size); + repeated_byte(EA,_,Size), Size > 8 ), arch.instruction_at(EA+Size,Inst). +/** +We want to split blocks that go from non-padding to padding. +However, this cannot be a regular block_limit because several +instructions could fallthrough into another one. +We need to consider the source address too. +*/ +.decl transition_block_limit(EA:address,Next:address) + +transition_block_limit(EA,Next):- + is_padding(Next), + next(EA,Next), + !is_padding(EA). // The targets are computed incrementally now as we traverse the code // likely_ea and possible_target_from are mutually recursive @@ -425,22 +438,25 @@ code_in_block_candidate(EA,EA):- possible_target(EA), possible_ea(EA). -// extend the block as long as we are sure to fallthrough and we have not +// Extend the block as long as we are sure to fallthrough and we have not // reached a block limit code_in_block_candidate(EA,Start):- code_in_block_candidate(EA2,Start), must_fallthrough(EA2,EA), - !block_limit(EA). + !block_limit(EA), + !transition_block_limit(EA2,EA). -// if reached a block limit or an instruction that does not necessarily +// If reached a block limit or an instruction that does not necessarily // fallthrough continue exploring but start a new block +// Always start a new block if we switch to padding instructions. code_in_block_candidate(EA,EA):- code_in_block_candidate(EA2,_), may_fallthrough(EA2,EA), ( !must_fallthrough(EA2,EA); - block_limit(EA) + block_limit(EA); + transition_block_limit(EA2,EA) ), possible_ea(EA). @@ -472,9 +488,12 @@ This is analogous to code_in_block_candidate, but for blocks of type "padding". 
nop_in_padding_candidate(EA,EA):- is_padding(EA), - !code_in_block_candidate_refined(EA,EA), + !code_in_block_candidate_refined(EA,_), next(Prev,EA), - !is_padding(Prev). + ( + !is_padding(Prev); + code_in_block_candidate_refined(Prev,_) + ). nop_in_padding_candidate(EA,EA):- padding_block_limit(EA), @@ -529,19 +548,17 @@ after_end(Next,End):- next(End,Next), !instruction_get_operation(Next,"INT3"). -// Search after jump tables -after_end(Next,End):- - relative_address(End,Size,_,_,_,_), Next=End+Size, - !relative_address(Next,_,_,_,_,_), - is_padding(Next). - -// Search after literal pools -after_end(NextThumb,EA), +// Search after jump tables and literal pools after_end(Next,EA):- - binary_isa("ARM"), data_block_candidate(EA,Size), Next = EA + Size, - NextThumb = Next + 1. + !relative_address(Next,_,_,_,_,_). + +// Search thumb code after literal pools +after_end(NextThumb,EA):- + binary_isa("ARM"), + data_block_candidate(EA,Size), + NextThumb = EA + Size + 1. // Propagation through nops after_end(Next,End):- @@ -557,6 +574,14 @@ possible_target(EA):- block_limit(EA) ). +// In ARM, if we find invalid instructions, we try with the other decode mode. +possible_target(SwitchDecodeModeEA):- + binary_isa("ARM"), + after_end(EA,_), + !is_padding(EA), + invalid(EA,_), + SwitchDecodeModeEA = EA bxor 1. + /////////////////////////////////////////////////////////////////////// // We are done with the recursive exploration // Now we detect and resolve conflicts within the different blocks @@ -618,6 +643,7 @@ data_block_candidate(Block,AccessSize):- code_in_block_candidate(EA,_), code_in_block_candidate(EA_load,_). +// Repeated bytes in code section data_block_candidate(EA,Size):- repeated_byte(EA,_,Size), Size > 8, !is_padding(EA). @@ -640,13 +666,37 @@ known_block(Block,"data",4,"relocation"):- !instruction_has_relocation(_,Block), address_in_data(Block,_). +/** +String in code section, either it is referenced +or it is long enough (>8 bytes). + +EA: adr r0, RefAddr +... 
+RefAddr: .string "..." +*/ +block_heuristic(RefAddr,"data",Size,0,"possible string"), +data_block_candidate(RefAddr,Size):- + code_section(Name), + loaded_section(BegSect,EndSect,Name), + BegSect <= RefAddr, RefAddr < EndSect, + ascii_string(RefAddr,End), + Size = End - RefAddr, + ( + arch.pc_relative_addr(EA,_,RefAddr), + code_in_block_candidate(EA,_), + !composite_data_access(EA,_,_,_) + ; + Size > 8 + ). + +// Pointer to a string data_block_candidate(Addr,Pt_size):- code_section(Name), loaded_section(BegSect,EndSect,Name), BegSect <= Addr, Addr < EndSect, aligned_address_in_data(Addr,Dest), ascii_string(StrBeg,StrEnd), - StrBeg >= Dest, Dest < StrEnd, + StrBeg <= Dest, Dest < StrEnd, arch.pointer_size(Pt_size). /** The block candidate defined by address 'Block' extends from 'BegAddr' to 'EndAddr'. diff --git a/src/datalog/code_inference_postprocess.dl b/src/datalog/code_inference_postprocess.dl index 63548701..8eb82fb1 100644 --- a/src/datalog/code_inference_postprocess.dl +++ b/src/datalog/code_inference_postprocess.dl @@ -188,6 +188,7 @@ padding(EA,Size):- //this condition guarantees that we take the last after_end of a sequence of nops !is_padding(NonNop), next(End,EA), + is_padding(EA), Size = NonNop-EA, Size > 0. diff --git a/src/datalog/code_inference_weights.dl b/src/datalog/code_inference_weights.dl index f6ffaa2e..16b292dd 100644 --- a/src/datalog/code_inference_weights.dl +++ b/src/datalog/code_inference_weights.dl @@ -54,7 +54,10 @@ heuristic_weight(Name,Type,Weight):- // Proportional weights default_heuristic_weight("size",PROPORTIONAL_WEIGHT,5). - +default_heuristic_weight("unresolved-may-fallthrough",PROPORTIONAL_WEIGHT,0). +default_heuristic_weight("resolved-reaches strong",PROPORTIONAL_WEIGHT,2). +default_heuristic_weight("resolved-reaches weak",PROPORTIONAL_WEIGHT,1). +default_heuristic_weight("possible string",PROPORTIONAL_WEIGHT,1). // Simple weights default_heuristic_weight("address in data array",SIMPLE_WEIGHT,1). 
@@ -85,9 +88,7 @@ default_heuristic_weight("relative-jump-table-start",SIMPLE_WEIGHT,3). default_heuristic_weight("relative jump table target: absolute",SIMPLE_WEIGHT,1). default_heuristic_weight("relative jump table target",SIMPLE_WEIGHT,2). default_heuristic_weight("repeated byte",SIMPLE_WEIGHT,10). -default_heuristic_weight("resolved-reaches strong",SIMPLE_WEIGHT,7). -default_heuristic_weight("resolved-reaches weak",SIMPLE_WEIGHT,3). + default_heuristic_weight("seh handler",SIMPLE_WEIGHT,5). default_heuristic_weight("unresolved-direct-call",SIMPLE_WEIGHT,2). default_heuristic_weight("unresolved-direct-jump",SIMPLE_WEIGHT,2). -default_heuristic_weight("unresolved-may-fallthrough",SIMPLE_WEIGHT,2). diff --git a/tests/arm_misc_test.py b/tests/arm_misc_test.py index 80dd1ff8..06d4be2a 100644 --- a/tests/arm_misc_test.py +++ b/tests/arm_misc_test.py @@ -51,22 +51,27 @@ def test_arm_invalid_ldr(self): ).ir() m = ir_library.modules[0] - main_first_block = None - blocks_are_data = [] for sym in m.symbols: if sym.name == "main": - main_first_block = sym.referent + self.assertTrue(isinstance(sym.referent, gtirb.CodeBlock)) continue if not sym.name.startswith(".INVALID"): continue - blocks_are_data.append( - isinstance(sym.referent, gtirb.DataBlock) - ) - - self.assertTrue(isinstance(main_first_block, gtirb.CodeBlock)) - self.assertTrue(all(blocks_are_data)) - self.assertEqual(len(blocks_are_data), len(invalid_syms)) + if "THUMB" in sym.name: + # The symbol should not point to a thumb code block + self.assertFalse( + isinstance(sym.referent, gtirb.CodeBlock) + and sym.referent.decode_mode + == gtirb.CodeBlock.DecodeMode.Thumb + ) + else: + # The symbol should not point to an arm code block + self.assertFalse( + isinstance(sym.referent, gtirb.CodeBlock) + and sym.referent.decode_mode + == gtirb.CodeBlock.DecodeMode.Default + ) @unittest.skipUnless( platform.system() == "Linux", "This test is linux only."