From a40c0aad27c9d88420409d22d0f23091b7e39728 Mon Sep 17 00:00:00 2001
From: Tijme Gommers <tijme@device.local>
Date: Thu, 24 May 2018 15:12:30 +0200
Subject: [PATCH] Fixed stack overflow in regex recursion.

---
 .semver                              |   2 +-
 extension/ExtensionDetails.py        |   2 +-
 extension/GraphWave.py               |   2 +-
 extension/GraphWaveDocumentParser.py | 124 +++++++++++++++++++++
 extension/GraphWaveHttpListener.py   |   4 +-
 extension/GraphWaveResponse.py       |   8 +-
 extension/GraphWaveSimilarity.py     | 158 ++++++++++-----------------
 7 files changed, 193 insertions(+), 107 deletions(-)
 create mode 100644 extension/GraphWaveDocumentParser.py

diff --git a/.semver b/.semver
index 7dea76e..6d7de6e 100644
--- a/.semver
+++ b/.semver
@@ -1 +1 @@
-1.0.1
+1.0.2
diff --git a/extension/ExtensionDetails.py b/extension/ExtensionDetails.py
index 3210eda..ea339b9 100644
--- a/extension/ExtensionDetails.py
+++ b/extension/ExtensionDetails.py
@@ -41,7 +41,7 @@ class ExtensionDetails:
 
     VERSION = "Unknown"
 
-    DEBUG = True
+    DEBUG = False
 
     STATUS_LOADING = "loading"
     STATUS_DISABLED = "disabled"
diff --git a/extension/GraphWave.py b/extension/GraphWave.py
index b9d4d17..60b18fa 100644
--- a/extension/GraphWave.py
+++ b/extension/GraphWave.py
@@ -222,7 +222,7 @@ def getMatchingPoints(self, response, properties):
         stylometry_value = 0
 
         for response_in_graph in self.responses():
-            similarity = response.getSimilarity(response_in_graph)
+            similarity = response.getSimilarity(response_in_graph, response.url)
             if similarity > self.options["mst"]:
                 stylometry_count += 1
                 stylometry_value += similarity
diff --git a/extension/GraphWaveDocumentParser.py b/extension/GraphWaveDocumentParser.py
new file mode 100644
index 0000000..3bcbd4a
--- /dev/null
+++ b/extension/GraphWaveDocumentParser.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+# MIT License
+#
+# Copyright (c) 2018 Tijme Gommers
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+class GraphWaveDocumentParser:
+    """A helper class for extracting certain characteristics from HTML documents."""
+
+    def __init__(self, document):
+        """Initialize the document parser with the given HTML document string.
+
+        Args:
+            document (str): The HTML document to parse.
+
+        """
+
+        self.document = document
+        self.parsed = False
+
+        self.is_in_element = False
+        self.is_in_attribute = False
+        self.is_in_class_attribute = False
+
+        self.class_attributes = {}
+        self.classes = []
+        self.elements = {}
+        self.element_index = 0
+
+        self.attribute_quotes = ["\"", "'", "`"]
+        self.attribute_quote = None
+
+        self.parse()
+
+    def addToIndex(self, arr, indx, char):
+        """Append a certain character to an element on the given index.
+
+        Args:
+            arr (obj): The dict to add the char to.
+            indx (int): The index to place the character on.
+            char (str): The character to append.
+
+        """
+
+        if indx in arr:
+            arr[indx] += char
+        else:
+            arr[indx] = char
+
+    def parse(self):
+        """Parse the document by extracting tags and classes."""
+
+        if self.parsed:
+            return
+
+        for character in self.document:
+            if not self.is_in_element and character == "<":
+                self.is_in_element = True
+
+            if self.is_in_element:
+                self.addToIndex(self.elements, self.element_index, character)
+
+            if self.is_in_attribute and character == self.attribute_quote:
+                self.is_in_attribute = False
+                self.is_in_class_attribute = False
+                self.attribute_quote = None
+            else:
+                if self.is_in_class_attribute:
+                    self.addToIndex(self.class_attributes, self.element_index, character)
+
+                if not self.is_in_attribute and self.is_in_element and character in self.attribute_quotes:
+                    self.attribute_quote = character
+                    self.is_in_attribute = True
+
+                    if self.elements[self.element_index][-7:-2] == "class":
+                        self.is_in_class_attribute = True
+
+            if self.is_in_element and character == ">" and not self.is_in_attribute:
+                self.is_in_element = False
+                self.element_index += 1
+
+        for class_attribute in self.class_attributes.values():
+            if class_attribute:
+                self.classes.extend(class_attribute.split(" "))
+
+        self.parsed = True
+
+    def getTags(self):
+        """Return HTML elements/tags of the parsed document.
+
+        Returns:
+            list(str): The HTML elements/tags of the parsed document.
+
+        """
+
+        return self.elements.values()
+
+    def getClasses(self):
+        """Return CSS classes of the parsed document.
+
+        Returns:
+            list(str): The CSS classes of the parsed document.
+
+        """
+
+        return self.classes
diff --git a/extension/GraphWaveHttpListener.py b/extension/GraphWaveHttpListener.py
index 2c9f920..c1eab6d 100644
--- a/extension/GraphWaveHttpListener.py
+++ b/extension/GraphWaveHttpListener.py
@@ -86,7 +86,7 @@ def processHttpMessage(self, toolFlag, messageIsRequest, requestResponse):
         response = self._helpers.analyzeResponse(requestResponse.getResponse())
         html = self._helpers.bytesToString(requestResponse.getResponse())
 
-        self._lock.acquire()
+        # self._lock.acquire()
 
         if self.shouldContinueWithMessage(request, response, html):
             response = GraphWaveResponse(request.getUrl().toString(), html)
@@ -99,7 +99,7 @@ def processHttpMessage(self, toolFlag, messageIsRequest, requestResponse):
             self._config.include(request.getUrl().toString())
 
         self._refreshInterface()
-        self._lock.release()
+        # self._lock.release()
 
     def shouldContinueWithMessage(self, request, response, html):
         """Check if a message could be ignored. A message can't be ignored if
diff --git a/extension/GraphWaveResponse.py b/extension/GraphWaveResponse.py
index 00b352f..f6c66a3 100644
--- a/extension/GraphWaveResponse.py
+++ b/extension/GraphWaveResponse.py
@@ -89,7 +89,7 @@ def __ne__(self, other):
 
         return str(self) != str(other)
 
-    def getSimilarity(self, other):
+    def getSimilarity(self, other, url):
         """Check if the given response is similar to this response.
 
         Returns:
@@ -97,7 +97,9 @@ def getSimilarity(self, other):
 
         """
 
-        structural = GraphWaveSimilarity.getStructuralSimilarity(self.html, other.html)
-        style = GraphWaveSimilarity.getStyleSimilarity(self.html, other.html)
+        similarity = GraphWaveSimilarity(self.html, other.html)
+
+        structural = similarity.getStructuralSimilarity()
+        style = similarity.getStyleSimilarity()
 
         return (0.80 * structural) + (0.20 * style)
diff --git a/extension/GraphWaveSimilarity.py b/extension/GraphWaveSimilarity.py
index c6c8a2c..4aa4bb9 100644
--- a/extension/GraphWaveSimilarity.py
+++ b/extension/GraphWaveSimilarity.py
@@ -23,84 +23,80 @@
 # SOFTWARE.
 
 import difflib
-import re
 import hashlib
 
+from GraphWaveDocumentParser import GraphWaveDocumentParser
+
 class GraphWaveSimilarity:
     """The GraphWaveHttpListener listens to all spider packages flowing through Burp Suite.
 
     Attributes:
-        tags_regex (obj): A regular expression that helps to extract HTML tags.
-        classes_regex (obj): A regular expression that helps to extract HTML classes.
-        structural_cache dict(obj): A key/value cache for HTML structures.
-        style_cache dict(obj): A key/value cache for HTML classes.
+        cache dict(obj): A key/value cache for all kinds of slow functionality.
 
     """
 
-    tags_regex = re.compile("<([a-zA-Z0-9]+)(([^<>])+)?>")
+    cache = {}
+
+    def __init__(self, document1, document2):
+        """Initialize the similarity measure class.
+
+        Args:
+            document1 (str): The first document to measure.
+            document2 (str): The second document to measure.
 
-    classes_regex = re.compile("class=(['\"`])(.+?)\1")
+        """
 
-    structural_cache = {}
+        self.document1 = document1.encode('utf-8')
+        self.document2 = document2.encode('utf-8')
 
-    style_cache = {}
+        self.parser1 = self.doCache("parser", self.getHashOf(self.document1), lambda : GraphWaveDocumentParser(self.document1))
+        self.parser2 = self.doCache("parser", self.getHashOf(self.document2), lambda : GraphWaveDocumentParser(self.document2))
 
-    @staticmethod
-    def getJaccardSimilarity(set1, set2):
-        """Get the Jaccard distance between two sets.
+    def getHashOf(self, document):
+        """Get the MD5 hex digest of a document (used as a cache key).
 
         Args:
-            set1 (set): The first set to measure.
-            set2 (set): The second set to measure.
+            document (str): The document to hash.
 
         Returns:
-            (float): The Jaccard distance between the two sets.
+            str: The hash of the given document.
 
         """
 
-        set1 = set(set1)
-        set2 = set(set2)
+        hash_object = hashlib.md5(document)
+        return hash_object.hexdigest()
 
-        intersection = len(set1 & set2)
-
-        if len(set1) == 0 and len(set2) == 0:
-            return 1.0
-
-        denominator = len(set1) + len(set2) - intersection
-        return intersection / max(denominator, 0.000001)
-
-    @staticmethod
-    def getStructuralSimilarity(document1, document2):
-        """Get the structural similarity between two documents.
+    def doCache(self, store, key, callback):
+        """Cache the result of the given callback in a store using the given key.
 
         Args:
-            document1 (str): The first document to measure.
-            document2 (str): The second document to measure.
+            store (str): The cache store to use.
+            key (str): The index key (usually a hash).
+            callback (lambda): The callback containing the value.
 
         Returns:
-            (float): The structural similarity between the two documents.
+            obj: The cached value of the callback function.
 
         """
 
-        # SET 1
-        hash_object1 = hashlib.md5(document1.encode('utf-8'))
-        hash_str1 = hash_object1.hexdigest()
+        if not store in GraphWaveSimilarity.cache.keys():
+            GraphWaveSimilarity.cache[store] = {}
+
+        if not key in GraphWaveSimilarity.cache[store].keys():
+            GraphWaveSimilarity.cache[store][key] = callback()
+
+        return GraphWaveSimilarity.cache[store][key]
+
+    def getStructuralSimilarity(self):
+        """Get the structural similarity between two documents.
 
-        if hash_str1 in GraphWaveSimilarity.structural_cache.keys():
-            tags1 = GraphWaveSimilarity.structural_cache[hash_str1]
-        else:
-            tags1 = GraphWaveSimilarity.getTagsFromDocument(document1.encode('utf-8'))
-            GraphWaveSimilarity.structural_cache[hash_str1] = tags1
+        Returns:
+            (float): The structural similarity between the two documents.
 
-        # SET 2
-        hash_object2 = hashlib.md5(document2.encode('utf-8'))
-        hash_str2 = hash_object2.hexdigest()
+        """
 
-        if hash_str2 in GraphWaveSimilarity.structural_cache.keys():
-            tags2 = GraphWaveSimilarity.structural_cache[hash_str2]
-        else:
-            tags2 = GraphWaveSimilarity.getTagsFromDocument(document2.encode('utf-8'))
-            GraphWaveSimilarity.structural_cache[hash_str2] = tags2
+        tags1 = self.doCache("structural", self.getHashOf(self.document1), lambda : self.parser1.getTags())
+        tags2 = self.doCache("structural", self.getHashOf(self.document2), lambda : self.parser2.getTags())
 
         diff = difflib.SequenceMatcher()
 
@@ -109,74 +105,38 @@ def getStructuralSimilarity(document1, document2):
 
         return diff.real_quick_ratio()
 
-    @staticmethod
-    def getStyleSimilarity(document1, document2):
+    def getStyleSimilarity(self):
         """Get the style similarity between two documents.
 
-        Args:
-            document1 (str): The first document to measure.
-            document2 (str): The second document to measure.
-
         Returns:
             (float): The style similarity between the two documents.
 
         """
 
-        # SET 1
-        hash_object1 = hashlib.md5(document1.encode('utf-8'))
-        hash_str1 = hash_object1.hexdigest()
-
-        if hash_str1 in GraphWaveSimilarity.style_cache.keys():
-            classes_page1 = GraphWaveSimilarity.style_cache[hash_str1]
-        else:
-            GraphWaveSimilarity.style_cache[hash_str1] = GraphWaveSimilarity.getClassesFromDocument(document1)
-            classes_page1 = GraphWaveSimilarity.style_cache[hash_str1]
+        classes_page1 = self.doCache("style", self.getHashOf(self.document1), lambda : self.parser1.getClasses())
+        classes_page2 = self.doCache("style", self.getHashOf(self.document2), lambda : self.parser2.getClasses())
 
-        # SET 2
-        hash_object2 = hashlib.md5(document2.encode('utf-8'))
-        hash_str2 = hash_object2.hexdigest()
-
-        if hash_str2 in GraphWaveSimilarity.style_cache.keys():
-            classes_page2 = GraphWaveSimilarity.style_cache[hash_str2]
-        else:
-            GraphWaveSimilarity.style_cache[hash_str2] = GraphWaveSimilarity.getClassesFromDocument(document2)
-            classes_page2 = GraphWaveSimilarity.style_cache[hash_str2]
-
-        return GraphWaveSimilarity.getJaccardSimilarity(classes_page1, classes_page2)
-
-    @staticmethod
-    def getTagsFromDocument(document):
-        """Get the HTML tags from a document.
-
-        Args:
-            document (str): The document to get HTML tags from.
-
-        Returns:
-            list(str): The tags/elements in the document.
+        return self.getJaccardSimilarity(classes_page1, classes_page2)
 
-        """
-
-        results = re.findall(GraphWaveSimilarity.tags_regex, document)
-        return list(results)
-
-    @staticmethod
-    def getClassesFromDocument(document):
-        """Get the HTML style classes from a document.
+    def getJaccardSimilarity(self, set1, set2):
+        """Get the Jaccard similarity between two sets.
 
         Args:
-            document (str): The document to get HTML style classes from.
+            set1 (set): The first set to measure.
+            set2 (set): The second set to measure.
 
         Returns:
-            set(str): The style classes in the document.
+            (float): The Jaccard similarity between the two sets.
 
         """
 
-        style_class_strings = re.findall(GraphWaveSimilarity.classes_regex, document)
+        set1 = set(set1)
+        set2 = set(set2)
 
-        result = set()
+        intersection = len(set1 & set2)
 
-        for style_class_string in style_class_strings:
-            for style_class in style_class_string.split():
-                result.add(style_class)
+        if len(set1) == 0 and len(set2) == 0:
+            return 1.0
 
-        return result
+        denominator = len(set1) + len(set2) - intersection
+        return intersection / max(denominator, 0.000001)