Skip to content
This repository has been archived by the owner on May 2, 2020. It is now read-only.

Commit

Permalink
Fixed stack overflow in regex recursion.
Browse files Browse the repository at this point in the history
  • Loading branch information
Tijme Gommers committed May 24, 2018
1 parent 67a1579 commit a40c0aa
Show file tree
Hide file tree
Showing 7 changed files with 193 additions and 107 deletions.
2 changes: 1 addition & 1 deletion .semver
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.1
1.0.2
2 changes: 1 addition & 1 deletion extension/ExtensionDetails.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class ExtensionDetails:

VERSION = "Unknown"

DEBUG = True
DEBUG = False

STATUS_LOADING = "loading"
STATUS_DISABLED = "disabled"
Expand Down
2 changes: 1 addition & 1 deletion extension/GraphWave.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def getMatchingPoints(self, response, properties):
stylometry_value = 0

for response_in_graph in self.responses():
similarity = response.getSimilarity(response_in_graph)
similarity = response.getSimilarity(response_in_graph, response.url)
if similarity > self.options["mst"]:
stylometry_count += 1
stylometry_value += similarity
Expand Down
124 changes: 124 additions & 0 deletions extension/GraphWaveDocumentParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-

# MIT License
#
# Copyright (c) 2018 Tijme Gommers
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

class GraphWaveDocumentParser:
    """A helper class for extracting certain characteristics from HTML documents.

    The document is scanned once, character by character, by a small state
    machine (no regular expressions and no recursion). While scanning it
    collects the raw text of every element/tag (everything from a ``<`` up to
    the first ``>`` that is not inside a quoted attribute value) and the
    values of all quoted ``class`` attributes.

    Attributes:
        document (str): The HTML document that was given to the constructor.
        parsed (bool): True once ``parse()`` has completed.
        elements (dict): Raw tag text, keyed by the index of the tag.
        class_attributes (dict): Raw ``class`` attribute values, keyed by the
            index of the tag they were found in.
        classes (list): All CSS class names found in the document.

    """

    def __init__(self, document):
        """Initialize the document parser with the given HTML document string.

        The document is parsed immediately, so the getters can be used right
        after construction.

        Args:
            document (str): The HTML document to parse.

        """

        self.document = document
        self.parsed = False

        # State of the scanner.
        self.is_in_element = False
        self.is_in_attribute = False
        self.is_in_class_attribute = False

        # Results, filled by `parse()`.
        self.class_attributes = {}
        self.classes = []
        self.elements = {}
        self.element_index = 0

        # Characters that may open a quoted attribute value, and the one that
        # opened the value currently being scanned (it must also close it).
        self.attribute_quotes = ["\"", "'", "`"]
        self.attribute_quote = None

        self.parse()

    def addToIndex(self, arr, indx, char):
        """Append a certain character to an element on the given index.

        Args:
            arr (obj): The dict to add the char to.
            indx (int): The index to place the character on.
            char (str): The character to append.

        """

        arr[indx] = arr.get(indx, "") + char

    def _opensClassAttribute(self, element):
        """Check if the quote that was just read opens a `class` attribute.

        The attribute name is matched case-insensitively, whitespace around
        the equals sign is allowed and the name must be exactly `class`
        (so `ng-class="..."` or `data-class="..."` do not match).

        Args:
            element (str): The tag text collected so far, ending with the
                opening quote character.

        Returns:
            bool: True if the quote starts the value of a `class` attribute.

        """

        # Drop the opening quote and any whitespace between it and the `=`.
        before_quote = element[:-1].rstrip()

        if not before_quote.endswith("="):
            return False

        attribute_name = before_quote[:-1].rstrip()

        if attribute_name[-5:].lower() != "class":
            return False

        # The character in front of `class` may not be part of a longer
        # attribute name.
        preceding = attribute_name[-6:-5]

        return not (preceding.isalnum() or preceding in ("-", "_", ":", "."))

    def parse(self):
        """Parse the document by extracting tags and classes.

        Parsing only happens once; subsequent calls return immediately.

        """

        if self.parsed:
            return

        for character in self.document:
            if not self.is_in_element and character == "<":
                self.is_in_element = True

            if self.is_in_element:
                self.addToIndex(self.elements, self.element_index, character)

            if self.is_in_attribute and character == self.attribute_quote:
                # Closing quote: the attribute value has ended.
                self.is_in_attribute = False
                self.is_in_class_attribute = False
                self.attribute_quote = None
            elif self.is_in_class_attribute:
                # A character of a `class` value (the quotes are not stored).
                self.addToIndex(self.class_attributes, self.element_index, character)
            elif not self.is_in_attribute and self.is_in_element and character in self.attribute_quotes:
                # Opening quote of an attribute value.
                self.attribute_quote = character
                self.is_in_attribute = True

                if self._opensClassAttribute(self.elements[self.element_index]):
                    self.is_in_class_attribute = True

            # A `>` inside a quoted attribute value does not close the tag.
            if self.is_in_element and character == ">" and not self.is_in_attribute:
                self.is_in_element = False
                self.element_index += 1

        for index in sorted(self.class_attributes):
            # Split on any run of whitespace so that repeated spaces, tabs
            # and newlines never produce empty class names.
            self.classes.extend(self.class_attributes[index].split())

        self.parsed = True

    def getTags(self):
        """Return HTML elements/tags of the parsed document.

        Returns:
            list(str): The HTML elements/tags of the parsed document, in the
                order in which they appear in the document.

        """

        return [self.elements[index] for index in sorted(self.elements)]

    def getClasses(self):
        """Return CSS classes of the parsed document.

        Returns:
            list(str): The CSS classes of the parsed document.

        """

        return self.classes
4 changes: 2 additions & 2 deletions extension/GraphWaveHttpListener.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def processHttpMessage(self, toolFlag, messageIsRequest, requestResponse):
response = self._helpers.analyzeResponse(requestResponse.getResponse())
html = self._helpers.bytesToString(requestResponse.getResponse())

self._lock.acquire()
# self._lock.acquire()

if self.shouldContinueWithMessage(request, response, html):
response = GraphWaveResponse(request.getUrl().toString(), html)
Expand All @@ -99,7 +99,7 @@ def processHttpMessage(self, toolFlag, messageIsRequest, requestResponse):
self._config.include(request.getUrl().toString())

self._refreshInterface()
self._lock.release()
# self._lock.release()

def shouldContinueWithMessage(self, request, response, html):
"""Check if a message could be ignored. A message can't be ignored if
Expand Down
8 changes: 5 additions & 3 deletions extension/GraphWaveResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,17 @@ def __ne__(self, other):

return str(self) != str(other)

def getSimilarity(self, other):
def getSimilarity(self, other, url):
"""Check if the given response is similar to this response.
Returns:
float: The Jaccard distance (similarity measure).
"""

structural = GraphWaveSimilarity.getStructuralSimilarity(self.html, other.html)
style = GraphWaveSimilarity.getStyleSimilarity(self.html, other.html)
similarity = GraphWaveSimilarity(self.html, other.html)

structural = similarity.getStructuralSimilarity()
style = similarity.getStyleSimilarity()

return (0.80 * structural) + (0.20 * style)
Loading

0 comments on commit a40c0aa

Please sign in to comment.