Skip to content

Commit

Permalink
Merge branch 'additional-code-inference-changes' into 'main'
Browse files Browse the repository at this point in the history
additional code inference changes

See merge request rewriting/ddisasm!1218
  • Loading branch information
aeflores committed Oct 1, 2024
2 parents e1f3ca6 + 6024859 commit 4c7ed39
Show file tree
Hide file tree
Showing 9 changed files with 102 additions and 50 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@

* Fix a hang due to incorrect jump-table boundaries inferred from irrelevant register correlations to the index register
* Requires gtirb >=2.2.0
* Improved code inference in ARM binaries:
* Improved code inference:
- Do not miss code after literal pools.
- Switch decode mode if invalid instruction found in ARM.
- Fixed bug in pointers to string data blocks.
- Restrict padding blocks so they do not share instructions with code blocks.
- Start a new block if we transition from non-padding to padding.
- Change the type of several heuristics from "simple" to "proportional"
- Additional heuristic: Simple string literals in literal pools
- Additional heuristic: Function beginning pattern with push/adjust-sp as plausible instruction sequence
* Fix bug that led to string data blocks potentially overlapping code blocks.
Expand Down
8 changes: 4 additions & 4 deletions examples/arm_asm_examples/ex_ldr/ex_ldr.s
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ thumbfunc:
cmp r0, #1
bhi .BHI19

.INVALID18:
.INVALID18_THUMB:
@invalid: ldrd r0, sp, [r1], #4
.short 0xe8f1
.short 0x0d01
Expand All @@ -187,7 +187,7 @@ thumbfunc:
cmp r0, #1
bhi .BHI20

.INVALID19:
.INVALID19_THUMB:
@invalid: ldrd r0, pc, [r1], #4
.short 0xe8f1
.short 0x0f01
Expand All @@ -196,7 +196,7 @@ thumbfunc:
cmp r0, #1
bhi .BHI21

.INVALID20:
.INVALID20_THUMB:
@invalid: ldrd sp, r0, [r1], #4
.short 0xe8f1
.short 0xd001
Expand All @@ -205,7 +205,7 @@ thumbfunc:
cmp r0, #1
bhi .exit_thumb

.INVALID21:
.INVALID21_THUMB:
@invalid: ldrd pc, r0, [r1], #4
.short 0xe8f1
.short 0xf001
Expand Down
12 changes: 0 additions & 12 deletions src/datalog/arch/arm32_code_inference.dl
Original file line number Diff line number Diff line change
Expand Up @@ -898,18 +898,6 @@ data_block_candidate(Block,Size):-
End = Block2 - 3
).

/**
EA: adr r0, RefAddr
...
RefAddr: .string "..."
*/
block_heuristic(RefAddr,"data",Size,0,"possible string"),
data_block_candidate(RefAddr,Size):-
arch.pc_relative_addr(EA,_,RefAddr),
code_in_block_candidate(EA,_),
!composite_data_access(EA,_,_,_),
ascii_string(RefAddr,End),
Size = End - RefAddr.

/**
EA: ldr r0, LitPoolAddr
Expand Down
1 change: 0 additions & 1 deletion src/datalog/arch/arm32_code_inference_weights.dl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ default_heuristic_weight("arm: jump table: no start",PROPORTIONAL_WEIGHT,2).
default_heuristic_weight("arm: jump table: no symbol",PROPORTIONAL_WEIGHT,15).
default_heuristic_weight("arm: jump table",PROPORTIONAL_WEIGHT,5).

default_heuristic_weight("possible string",PROPORTIONAL_WEIGHT,1).
default_heuristic_weight("possible string: string param for string library",PROPORTIONAL_WEIGHT,1).
default_heuristic_weight("possible string: string pred exists",PROPORTIONAL_WEIGHT,1).
default_heuristic_weight("possible string: string succ exists",PROPORTIONAL_WEIGHT,1).
Expand Down
3 changes: 3 additions & 0 deletions src/datalog/arch/arm64/interrupt_operations.dl
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@ interrupt_operation("INT").
interrupt_operation("INTO").
interrupt_operation("INT1").
interrupt_operation("INT3").

// Breakpoint operation
interrupt_operation("BRK").
86 changes: 68 additions & 18 deletions src/datalog/code_inference.dl
Original file line number Diff line number Diff line change
Expand Up @@ -377,10 +377,23 @@ block_limit(Inst):-
block_limit(Inst):-
(
arch.simple_data_load(_,EA,Size);
composite_data_access(_,_,EA,Size)
composite_data_access(_,_,EA,Size);
repeated_byte(EA,_,Size), Size > 8
),
arch.instruction_at(EA+Size,Inst).

/**
We want to split blocks that go from non-padding to padding.
However, this cannot be a regular block_limit because several
instructions could fall through into the same instruction.
We need to consider the source address too.
*/
.decl transition_block_limit(EA:address,Next:address)

transition_block_limit(EA,Next):-
is_padding(Next),
next(EA,Next),
!is_padding(EA).

// The targets are computed incrementally now as we traverse the code
// likely_ea and possible_target_from are mutually recursive
Expand Down Expand Up @@ -425,22 +438,25 @@ code_in_block_candidate(EA,EA):-
possible_target(EA),
possible_ea(EA).

// extend the block as long as we are sure to fallthrough and we have not
// Extend the block as long as we are sure to fallthrough and we have not
// reached a block limit
code_in_block_candidate(EA,Start):-
code_in_block_candidate(EA2,Start),
must_fallthrough(EA2,EA),
!block_limit(EA).
!block_limit(EA),
!transition_block_limit(EA2,EA).


// if reached a block limit or an instruction that does not necessarily
// If we reached a block limit or an instruction that does not necessarily
// fall through, continue exploring but start a new block.
// Always start a new block if we switch to padding instructions.
code_in_block_candidate(EA,EA):-
code_in_block_candidate(EA2,_),
may_fallthrough(EA2,EA),
(
!must_fallthrough(EA2,EA);
block_limit(EA)
block_limit(EA);
transition_block_limit(EA2,EA)
),
possible_ea(EA).

Expand Down Expand Up @@ -472,9 +488,12 @@ This is analogous to code_in_block_candidate, but for blocks of type "padding".

nop_in_padding_candidate(EA,EA):-
is_padding(EA),
!code_in_block_candidate_refined(EA,EA),
!code_in_block_candidate_refined(EA,_),
next(Prev,EA),
!is_padding(Prev).
(
!is_padding(Prev);
code_in_block_candidate_refined(Prev,_)
).

nop_in_padding_candidate(EA,EA):-
padding_block_limit(EA),
Expand Down Expand Up @@ -529,19 +548,17 @@ after_end(Next,End):-
next(End,Next),
!instruction_get_operation(Next,"INT3").

// Search after jump tables
after_end(Next,End):-
relative_address(End,Size,_,_,_,_), Next=End+Size,
!relative_address(Next,_,_,_,_,_),
is_padding(Next).

// Search after literal pools
after_end(NextThumb,EA),
// Search after jump tables and literal pools
after_end(Next,EA):-
binary_isa("ARM"),
data_block_candidate(EA,Size),
Next = EA + Size,
NextThumb = Next + 1.
!relative_address(Next,_,_,_,_,_).

// Search thumb code after literal pools
after_end(NextThumb,EA):-
binary_isa("ARM"),
data_block_candidate(EA,Size),
NextThumb = EA + Size + 1.

// Propagation through nops
after_end(Next,End):-
Expand All @@ -557,6 +574,14 @@ possible_target(EA):-
block_limit(EA)
).

// In ARM, if we find invalid instructions, we try with the other decode mode.
possible_target(SwitchDecodeModeEA):-
binary_isa("ARM"),
after_end(EA,_),
!is_padding(EA),
invalid(EA,_),
SwitchDecodeModeEA = EA bxor 1.

///////////////////////////////////////////////////////////////////////
// We are done with the recursive exploration
// Now we detect and resolve conflicts within the different blocks
Expand Down Expand Up @@ -618,6 +643,7 @@ data_block_candidate(Block,AccessSize):-
code_in_block_candidate(EA,_),
code_in_block_candidate(EA_load,_).

// Repeated bytes in code section
data_block_candidate(EA,Size):-
repeated_byte(EA,_,Size), Size > 8,
!is_padding(EA).
Expand All @@ -640,13 +666,37 @@ known_block(Block,"data",4,"relocation"):-
!instruction_has_relocation(_,Block),
address_in_data(Block,_).

/**
String in code section: it is either referenced through a
PC-relative address or long enough (>8 bytes).

EA: adr r0, RefAddr
...
RefAddr: .string "..."
*/
block_heuristic(RefAddr,"data",Size,0,"possible string"),
data_block_candidate(RefAddr,Size):-
code_section(Name),
loaded_section(BegSect,EndSect,Name),
BegSect <= RefAddr, RefAddr < EndSect,
ascii_string(RefAddr,End),
Size = End - RefAddr,
(
arch.pc_relative_addr(EA,_,RefAddr),
code_in_block_candidate(EA,_),
!composite_data_access(EA,_,_,_)
;
Size > 8
).

// Pointer to a string
data_block_candidate(Addr,Pt_size):-
code_section(Name),
loaded_section(BegSect,EndSect,Name),
BegSect <= Addr, Addr < EndSect,
aligned_address_in_data(Addr,Dest),
ascii_string(StrBeg,StrEnd),
StrBeg >= Dest, Dest < StrEnd,
StrBeg <= Dest, Dest < StrEnd,
arch.pointer_size(Pt_size).
/**
The block candidate defined by address 'Block' extends from 'BegAddr' to 'EndAddr'.
Expand Down
1 change: 1 addition & 0 deletions src/datalog/code_inference_postprocess.dl
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ padding(EA,Size):-
//this condition guarantees that we take the last after_end of a sequence of nops
!is_padding(NonNop),
next(End,EA),
is_padding(EA),
Size = NonNop-EA,
Size > 0.

Expand Down
9 changes: 5 additions & 4 deletions src/datalog/code_inference_weights.dl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ heuristic_weight(Name,Type,Weight):-

// Proportional weights
default_heuristic_weight("size",PROPORTIONAL_WEIGHT,5).

default_heuristic_weight("unresolved-may-fallthrough",PROPORTIONAL_WEIGHT,0).
default_heuristic_weight("resolved-reaches strong",PROPORTIONAL_WEIGHT,2).
default_heuristic_weight("resolved-reaches weak",PROPORTIONAL_WEIGHT,1).
default_heuristic_weight("possible string",PROPORTIONAL_WEIGHT,1).

// Simple weights
default_heuristic_weight("address in data array",SIMPLE_WEIGHT,1).
Expand Down Expand Up @@ -85,9 +88,7 @@ default_heuristic_weight("relative-jump-table-start",SIMPLE_WEIGHT,3).
default_heuristic_weight("relative jump table target: absolute",SIMPLE_WEIGHT,1).
default_heuristic_weight("relative jump table target",SIMPLE_WEIGHT,2).
default_heuristic_weight("repeated byte",SIMPLE_WEIGHT,10).
default_heuristic_weight("resolved-reaches strong",SIMPLE_WEIGHT,7).
default_heuristic_weight("resolved-reaches weak",SIMPLE_WEIGHT,3).

default_heuristic_weight("seh handler",SIMPLE_WEIGHT,5).
default_heuristic_weight("unresolved-direct-call",SIMPLE_WEIGHT,2).
default_heuristic_weight("unresolved-direct-jump",SIMPLE_WEIGHT,2).
default_heuristic_weight("unresolved-may-fallthrough",SIMPLE_WEIGHT,2).
25 changes: 15 additions & 10 deletions tests/arm_misc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,22 +51,27 @@ def test_arm_invalid_ldr(self):
).ir()
m = ir_library.modules[0]

main_first_block = None
blocks_are_data = []
for sym in m.symbols:
if sym.name == "main":
main_first_block = sym.referent
self.assertTrue(isinstance(sym.referent, gtirb.CodeBlock))
continue
if not sym.name.startswith(".INVALID"):
continue

blocks_are_data.append(
isinstance(sym.referent, gtirb.DataBlock)
)

self.assertTrue(isinstance(main_first_block, gtirb.CodeBlock))
self.assertTrue(all(blocks_are_data))
self.assertEqual(len(blocks_are_data), len(invalid_syms))
if "THUMB" in sym.name:
# The symbol should not point to a thumb code block
self.assertFalse(
isinstance(sym.referent, gtirb.CodeBlock)
and sym.referent.decode_mode
== gtirb.CodeBlock.DecodeMode.Thumb
)
else:
# The symbol should not point to an arm code block
self.assertFalse(
isinstance(sym.referent, gtirb.CodeBlock)
and sym.referent.decode_mode
== gtirb.CodeBlock.DecodeMode.Default
)

@unittest.skipUnless(
platform.system() == "Linux", "This test is linux only."
Expand Down

0 comments on commit 4c7ed39

Please sign in to comment.