From 291da9ffe83fcbb78267f2cb739d2ccb80ba7540 Mon Sep 17 00:00:00 2001
From: Valentinas Bakaitis
Date: Tue, 10 Sep 2024 13:29:25 +1200
Subject: [PATCH 1/2] Fixes issue #90 - produce a separate SARIF result for each finding location

---
 mobsfscan/formatters/sarif.py | 86 ++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/mobsfscan/formatters/sarif.py b/mobsfscan/formatters/sarif.py
index 59b2cd4..0be7495 100644
--- a/mobsfscan/formatters/sarif.py
+++ b/mobsfscan/formatters/sarif.py
@@ -76,14 +76,15 @@ def add_results(path, scan_results, run):
     rule_indices = {}
 
     for rule_id, issue_dict in res.items():
-        result = create_result(path, rule_id, issue_dict, rules, rule_indices)
-        run.results.append(result)
+        results = create_rule_results(path, rule_id, issue_dict, rules, rule_indices)
+        run.results += results
 
     if len(rules) > 0:
         run.tool.driver.rules = list(rules.values())
 
 
-def create_result(path, rule_id, issue_dict, rules, rule_indices):
+def create_rule_results(path, rule_id, issue_dict, rules, rule_indices):
+    rule_results = []
     if rule_id in rules:
         rule = rules[rule_id]
         rule_index = rule_indices[rule_id]
@@ -105,21 +106,41 @@ def create_result(path, rule_id, issue_dict, rules, rule_indices):
         rules[rule_id] = rule
         rule_indices[rule_id] = rule_index
 
-    locations = []
-    for item in issue_dict.get('files', []):
-        physical_location = om.PhysicalLocation(
-            artifact_location=om.ArtifactLocation(
-                uri=to_uri(item['file_path'])),
-        )
-        physical_location.region = om.Region(
-            start_line=item['match_lines'][0],
-            end_line=item['match_lines'][1],
-            start_column=item['match_position'][0],
-            end_column=item['match_position'][1],
-            snippet=om.ArtifactContent(text=item['match_string']),
-        )
-        locations.append(om.Location(physical_location=physical_location))
-    if not locations:
+    files = issue_dict.get('files', [])
+
+    # If there are locations, iterate over them and create
+    # a separate result for each location.
+    if files:
+        for item in files:
+            locations = []
+            physical_location = om.PhysicalLocation(
+                artifact_location=om.ArtifactLocation(
+                    uri=to_uri(item['file_path'])),
+            )
+            physical_location.region = om.Region(
+                start_line=item['match_lines'][0],
+                end_line=item['match_lines'][1],
+                start_column=item['match_position'][0],
+                end_column=item['match_position'][1],
+                snippet=om.ArtifactContent(text=item['match_string']),
+            )
+            locations.append(om.Location(physical_location=physical_location))
+            rule_results.append(om.Result(
+                rule_id=rule.id,
+                rule_index=rule_index,
+                message=om.Message(text=issue_dict['metadata']['description']),
+                level=level_from_severity(issue_dict['metadata']['severity']),
+                locations=locations,
+                properties={
+                    'owasp-mobile': issue_dict['metadata']['owasp-mobile'],
+                    'masvs': issue_dict['metadata']['masvs'],
+                    'cwe': issue_dict['metadata']['cwe'],
+                    'reference': issue_dict['metadata']['reference'],
+                },
+            ))
+    # If there are no locations, create only a single result.
+    else:
+        locations = []
         artifact = om.PhysicalLocation(
             artifact_location=om.ArtifactLocation(
                 uri=path[0]),
@@ -132,20 +153,21 @@ def create_result(path, rule_id, issue_dict, rules, rule_indices):
             snippet=om.ArtifactContent(text='Missing Best Practice'),
         )
         locations.append(om.Location(physical_location=artifact))
-
-    return om.Result(
-        rule_id=rule.id,
-        rule_index=rule_index,
-        message=om.Message(text=issue_dict['metadata']['description']),
-        level=level_from_severity(issue_dict['metadata']['severity']),
-        locations=locations,
-        properties={
-            'owasp-mobile': issue_dict['metadata']['owasp-mobile'],
-            'masvs': issue_dict['metadata']['masvs'],
-            'cwe': issue_dict['metadata']['cwe'],
-            'reference': issue_dict['metadata']['reference'],
-        },
-    )
+        rule_results.append(om.Result(
+            rule_id=rule.id,
+            rule_index=rule_index,
+            message=om.Message(text=issue_dict['metadata']['description']),
+            level=level_from_severity(issue_dict['metadata']['severity']),
+            locations=locations,
+            properties={
+                'owasp-mobile': issue_dict['metadata']['owasp-mobile'],
+                'masvs': issue_dict['metadata']['masvs'],
+                'cwe': issue_dict['metadata']['cwe'],
+                'reference': issue_dict['metadata']['reference'],
+            },
+        ))
+
+    return rule_results
 
 
 def sarif_output(outfile, scan_results, mobsfscan_version, path):

From 3d3fca46da11a05f71d57425f38d18e360c263d3 Mon Sep 17 00:00:00 2001
From: Valentinas Bakaitis
Date: Wed, 11 Sep 2024 12:03:24 +1200
Subject: [PATCH 2/2] Deduplicate findings before returning them

---
 mobsfscan/mobsfscan.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/mobsfscan/mobsfscan.py b/mobsfscan/mobsfscan.py
index e4f677d..c924fbb 100644
--- a/mobsfscan/mobsfscan.py
+++ b/mobsfscan/mobsfscan.py
@@ -127,6 +127,20 @@ def format_output(self, results) -> dict:
         self.post_ignore_rules()
         self.post_ignore_rules_by_severity()
         self.post_ignore_files()
+        self.deduplicate_files()
+
+    def deduplicate_files(self):
+        for _, details in self.result['results'].items():
+            files = details.get('files')
+            # Some results don't have any files, so check before continuing.
+            if files:
+                # "file" here is the dict with file_path, match_lines, etc.
+                # For each file we build a tuple of its sorted items,
+                # then use those tuples as keys and the dicts as values.
+                # Duplicate keys collapse, so each unique file is kept once.
+                # Finally, take the dict's values to get back a list.
+                unique_files = list({tuple(sorted(f.items())): f for f in files}.values())
+                details['files'] = unique_files
 
     def format_semgrep(self, sgrep_output):
         """Format semgrep output."""
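
Note: a minimal, standalone sketch of the de-duplication idiom used in deduplicate_files() above (plain Python, no mobsfscan imports; the sample findings below are made up for illustration). Each finding dict becomes a hashable key by sorting its items into a tuple; because dict keys are unique, duplicate findings collapse while the first-seen order is preserved. This assumes the per-file values are themselves hashable (e.g. tuples rather than lists for match_lines), which the expression in the patch also requires.

    # Hypothetical findings for one rule; the second entry duplicates the first.
    files = [
        {'file_path': 'app/a.java', 'match_lines': (10, 10), 'match_string': 'x'},
        {'file_path': 'app/a.java', 'match_lines': (10, 10), 'match_string': 'x'},
        {'file_path': 'app/b.java', 'match_lines': (3, 4), 'match_string': 'y'},
    ]

    # Same expression as in the patch: sorted item-tuples act as dedup keys.
    unique_files = list({tuple(sorted(f.items())): f for f in files}.values())

    assert unique_files == [
        {'file_path': 'app/a.java', 'match_lines': (10, 10), 'match_string': 'x'},
        {'file_path': 'app/b.java', 'match_lines': (3, 4), 'match_string': 'y'},
    ]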