From a40c0aad27c9d88420409d22d0f23091b7e39728 Mon Sep 17 00:00:00 2001 From: Tijme Gommers Date: Thu, 24 May 2018 15:12:30 +0200 Subject: [PATCH] Fixed stack overflow in regex recursion. --- .semver | 2 +- extension/ExtensionDetails.py | 2 +- extension/GraphWave.py | 2 +- extension/GraphWaveDocumentParser.py | 124 +++++++++++++++++++++ extension/GraphWaveHttpListener.py | 4 +- extension/GraphWaveResponse.py | 8 +- extension/GraphWaveSimilarity.py | 158 ++++++++++----------------- 7 files changed, 193 insertions(+), 107 deletions(-) create mode 100644 extension/GraphWaveDocumentParser.py diff --git a/.semver b/.semver index 7dea76e..6d7de6e 100644 --- a/.semver +++ b/.semver @@ -1 +1 @@ -1.0.1 +1.0.2 diff --git a/extension/ExtensionDetails.py b/extension/ExtensionDetails.py index 3210eda..ea339b9 100644 --- a/extension/ExtensionDetails.py +++ b/extension/ExtensionDetails.py @@ -41,7 +41,7 @@ class ExtensionDetails: VERSION = "Unknown" - DEBUG = True + DEBUG = False STATUS_LOADING = "loading" STATUS_DISABLED = "disabled" diff --git a/extension/GraphWave.py b/extension/GraphWave.py index b9d4d17..60b18fa 100644 --- a/extension/GraphWave.py +++ b/extension/GraphWave.py @@ -222,7 +222,7 @@ def getMatchingPoints(self, response, properties): stylometry_value = 0 for response_in_graph in self.responses(): - similarity = response.getSimilarity(response_in_graph) + similarity = response.getSimilarity(response_in_graph, response.url) if similarity > self.options["mst"]: stylometry_count += 1 stylometry_value += similarity diff --git a/extension/GraphWaveDocumentParser.py b/extension/GraphWaveDocumentParser.py new file mode 100644 index 0000000..3bcbd4a --- /dev/null +++ b/extension/GraphWaveDocumentParser.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +# MIT License +# +# Copyright (c) 2018 Tijme Gommers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the 
Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +class GraphWaveDocumentParser: + """A helper class for extracting certain characteristics from HTML documents.""" + + def __init__(self, document): + """Initialize the document parser with the given HTML document string. + + Args: + document (str): The HTML document to parse. + + """ + + self.document = document + self.parsed = False + + self.is_in_element = False + self.is_in_attribute = False + self.is_in_class_attribute = False + + self.class_attributes = {} + self.classes = [] + self.elements = {} + self.element_index = 0 + + self.attribute_quotes = ["\"", "'", "`"] + self.attribute_quote = None + + self.parse() + + def addToIndex(self, arr, indx, char): + """Append a certain character to an element on the given index. + + Args: + arr (obj): The dict to add the char to. + indx (int): The index to place the character on. + char (str): The character to append.
+ + """ + + if indx in arr: + arr[indx] += char + else: + arr[indx] = char + + def parse(self): + """Parse the document by extracting tags and classes.""" + + if self.parsed: + return + + for character in self.document: + if not self.is_in_element and character == "<": + self.is_in_element = True + + if self.is_in_element: + self.addToIndex(self.elements, self.element_index, character) + + if self.is_in_attribute and character == self.attribute_quote: + self.is_in_attribute = False + self.is_in_class_attribute = False + self.attribute_quote = None + else: + if self.is_in_class_attribute: + self.addToIndex(self.class_attributes, self.element_index, character) + + if not self.is_in_attribute and self.is_in_element and character in self.attribute_quotes: + self.attribute_quote = character + self.is_in_attribute = True + + if self.elements[self.element_index][-7:-2] == "class": + self.is_in_class_attribute = True + + if self.is_in_element and character == ">" and not self.is_in_attribute: + self.is_in_element = False + self.element_index += 1 + + for class_attribute in self.class_attributes.values(): + if class_attribute: + self.classes.extend(class_attribute.split(" ")) + + self.parsed = True + + def getTags(self): + """Return HTML elements/tags of the parsed document. + + Returns: + list(str): The HTML elements/tags of the parsed document. + + """ + + return self.elements.values() + + def getClasses(self): + """Return CSS classes of the parsed document. + + Returns: + list(str): The CSS classes of the parsed document. 
+ + """ + + return self.classes diff --git a/extension/GraphWaveHttpListener.py b/extension/GraphWaveHttpListener.py index 2c9f920..c1eab6d 100644 --- a/extension/GraphWaveHttpListener.py +++ b/extension/GraphWaveHttpListener.py @@ -86,7 +86,7 @@ def processHttpMessage(self, toolFlag, messageIsRequest, requestResponse): response = self._helpers.analyzeResponse(requestResponse.getResponse()) html = self._helpers.bytesToString(requestResponse.getResponse()) - self._lock.acquire() + # self._lock.acquire() if self.shouldContinueWithMessage(request, response, html): response = GraphWaveResponse(request.getUrl().toString(), html) @@ -99,7 +99,7 @@ def processHttpMessage(self, toolFlag, messageIsRequest, requestResponse): self._config.include(request.getUrl().toString()) self._refreshInterface() - self._lock.release() + # self._lock.release() def shouldContinueWithMessage(self, request, response, html): """Check if a message could be ignored. A message can't be ignored if diff --git a/extension/GraphWaveResponse.py b/extension/GraphWaveResponse.py index 00b352f..f6c66a3 100644 --- a/extension/GraphWaveResponse.py +++ b/extension/GraphWaveResponse.py @@ -89,7 +89,7 @@ def __ne__(self, other): return str(self) != str(other) - def getSimilarity(self, other): + def getSimilarity(self, other, url): """Check if the given response is similar to this response. 
Returns: @@ -97,7 +97,9 @@ def getSimilarity(self, other): """ - structural = GraphWaveSimilarity.getStructuralSimilarity(self.html, other.html) - style = GraphWaveSimilarity.getStyleSimilarity(self.html, other.html) + similarity = GraphWaveSimilarity(self.html, other.html) + + structural = similarity.getStructuralSimilarity() + style = similarity.getStyleSimilarity() return (0.80 * structural) + (0.20 * style) diff --git a/extension/GraphWaveSimilarity.py b/extension/GraphWaveSimilarity.py index c6c8a2c..4aa4bb9 100644 --- a/extension/GraphWaveSimilarity.py +++ b/extension/GraphWaveSimilarity.py @@ -23,84 +23,80 @@ # SOFTWARE. import difflib -import re import hashlib +from GraphWaveDocumentParser import GraphWaveDocumentParser + class GraphWaveSimilarity: """The GraphWaveHttpListener listens to all spider packages flowing through Burp Suite. Attributes: - tags_regex (obj): A regular expression that helps to extract HTML tags. - classes_regex (obj): A regular expression that helps to extract HTML classes. - structural_cache dict(obj): A key/value cache for HTML structures. - style_cache dict(obj): A key/value cache for HTML classes. + cache dict(obj): A key/value cache for all kinds of slow functionality. """ - tags_regex = re.compile("<([a-zA-Z0-9]+)(([^<>])+)?>") + cache = {} + + def __init__(self, document1, document2): + """Initialize the similarity measure class. + + Args: + document1 (str): The first document to measure. + document2 (str): The second document to measure.
- classes_regex = re.compile("class=(['\"`])(.+?)\1") + """ - structural_cache = {} + self.document1 = document1.encode('utf-8') + self.document2 = document2.encode('utf-8') - style_cache = {} + self.parser1 = self.doCache("parser", self.getHashOf(self.document1), lambda : GraphWaveDocumentParser(self.document1)) + self.parser2 = self.doCache("parser", self.getHashOf(self.document2), lambda : GraphWaveDocumentParser(self.document2)) - @staticmethod - def getJaccardSimilarity(set1, set2): - """Get the Jaccard distance between two sets. + def getHashOf(self, document): + """Get the hash for dict indexes. Args: - set1 (set): The first set to measure. - set2 (set): The second set to measure. + document (str): The document to hash. Returns: - (float): The Jaccard distance between the two sets. + str: The hash of the given document. """ - set1 = set(set1) - set2 = set(set2) + hash_object = hashlib.md5(document) + return hash_object.hexdigest() - intersection = len(set1 & set2) - - if len(set1) == 0 and len(set2) == 0: - return 1.0 - - denominator = len(set1) + len(set2) - intersection - return intersection / max(denominator, 0.000001) - - @staticmethod - def getStructuralSimilarity(document1, document2): - """Get the structural similarity between two documents. + def doCache(self, store, key, callback): + """Cache the given lambda in a store using the given key. Args: - document1 (str): The first document to measure. - document2 (str): The second document to measure. + store (str): The cache store to use. + key (str): The index key (usually a hash). + callback (lambda): The callback containing the value. Returns: - (float): The structural similarity between the two documents. + obj: The cached value of the callback function. 
""" - # SET 1 - hash_object1 = hashlib.md5(document1.encode('utf-8')) - hash_str1 = hash_object1.hexdigest() + if not store in GraphWaveSimilarity.cache.keys(): + GraphWaveSimilarity.cache[store] = {} + + if not key in GraphWaveSimilarity.cache[store].keys(): + GraphWaveSimilarity.cache[store][key] = callback() + + return GraphWaveSimilarity.cache[store][key] + + def getStructuralSimilarity(self): + """Get the structural similarity between two documents. - if hash_str1 in GraphWaveSimilarity.structural_cache.keys(): - tags1 = GraphWaveSimilarity.structural_cache[hash_str1] - else: - tags1 = GraphWaveSimilarity.getTagsFromDocument(document1.encode('utf-8')) - GraphWaveSimilarity.structural_cache[hash_str1] = tags1 + Returns: + (float): The structural similarity between the two documents. - # SET 2 - hash_object2 = hashlib.md5(document2.encode('utf-8')) - hash_str2 = hash_object2.hexdigest() + """ - if hash_str2 in GraphWaveSimilarity.structural_cache.keys(): - tags2 = GraphWaveSimilarity.structural_cache[hash_str2] - else: - tags2 = GraphWaveSimilarity.getTagsFromDocument(document2.encode('utf-8')) - GraphWaveSimilarity.structural_cache[hash_str2] = tags2 + tags1 = self.doCache("structural", self.getHashOf(self.document1), lambda : self.parser1.getTags()) + tags2 = self.doCache("structural", self.getHashOf(self.document2), lambda : self.parser2.getTags()) diff = difflib.SequenceMatcher() @@ -109,74 +105,38 @@ def getStructuralSimilarity(document1, document2): return diff.real_quick_ratio() - @staticmethod - def getStyleSimilarity(document1, document2): + def getStyleSimilarity(self): """Get the style similarity between two documents. - Args: - document1 (str): The first document to measure. - document2 (str): The second document to measure. - Returns: (float): The style similarity between the two documents. 
""" - # SET 1 - hash_object1 = hashlib.md5(document1.encode('utf-8')) - hash_str1 = hash_object1.hexdigest() - - if hash_str1 in GraphWaveSimilarity.style_cache.keys(): - classes_page1 = GraphWaveSimilarity.style_cache[hash_str1] - else: - GraphWaveSimilarity.style_cache[hash_str1] = GraphWaveSimilarity.getClassesFromDocument(document1) - classes_page1 = GraphWaveSimilarity.style_cache[hash_str1] + classes_page1 = self.doCache("style", self.getHashOf(self.document1), lambda : self.parser1.getClasses()) + classes_page2 = self.doCache("style", self.getHashOf(self.document2), lambda : self.parser2.getClasses()) - # SET 2 - hash_object2 = hashlib.md5(document2.encode('utf-8')) - hash_str2 = hash_object2.hexdigest() - - if hash_str2 in GraphWaveSimilarity.style_cache.keys(): - classes_page2 = GraphWaveSimilarity.style_cache[hash_str2] - else: - GraphWaveSimilarity.style_cache[hash_str2] = GraphWaveSimilarity.getClassesFromDocument(document2) - classes_page2 = GraphWaveSimilarity.style_cache[hash_str2] - - return GraphWaveSimilarity.getJaccardSimilarity(classes_page1, classes_page2) - - @staticmethod - def getTagsFromDocument(document): - """Get the HTML tags from a document. - - Args: - document (str): The document to get HTML tags from. - - Returns: - list(str): The tags/elements in the document. + return self.getJaccardSimilarity(classes_page1, classes_page2) - """ - - results = re.findall(GraphWaveSimilarity.tags_regex, document) - return list(results) - - @staticmethod - def getClassesFromDocument(document): - """Get the HTML style classes from a document. + def getJaccardSimilarity(self, set1, set2): + """Get the Jaccard distance between two sets. Args: - document (str): The document to get HTML style classes from. + set1 (set): The first set to measure. + set2 (set): The second set to measure. Returns: - set(str): The style classes in the document. + (float): The Jaccard distance between the two sets. 
""" - style_class_strings = re.findall(GraphWaveSimilarity.classes_regex, document) + set1 = set(set1) + set2 = set(set2) - result = set() + intersection = len(set1 & set2) - for style_class_string in style_class_strings: - for style_class in style_class_string.split(): - result.add(style_class) + if len(set1) == 0 and len(set2) == 0: + return 1.0 - return result + denominator = len(set1) + len(set2) - intersection + return intersection / max(denominator, 0.000001)