blues-lab · objorkman · Apr 19, 2022 · Apr 27, 2022 · nsamarin · Apr 20, 2022
diff --git a/polipy/constants.py b/polipy/constants.py
@@ -2,3 +2,20 @@
 
 UTC_DATE = datetime.datetime.utcnow().strftime('%Y%m%d')
 CWD = os.getcwd()
+
+KEYWORDS = {  # CCPA category -> phrases looked up in the lowercased policy text, so keep entries lowercase.
+    'identifiers': [
+        'real name', 'alias', 'postal address', 'address', 'unique personal identifier',
+        'online identifier', 'ip address', 'email address', 'email', 'account name',
+        'social security number', 'driver license number', 'passport number'
+    ],
+    'customer records information': [
+        'name', 'signature', 'social security number', 'ssn',
+        'physical characteristics', 'address', 'telephone number', 'phone number',
+        'passport number', 'drivers license',
+        'state identification card number', 'insurance policy number',
+        'education', 'employment', 'employment history', 'bank account number',
+        'credit card number', 'debit card number', 'financial information',
+        'medical information', 'health insurance information'
+    ]
+}
diff --git a/polipy/extractors.py b/polipy/extractors.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 from io import BytesIO, StringIO
 from pdfminer.high_level import extract_text as parse_pdf
+from .constants import KEYWORDS
 
 extractors = [
     'text'
@@ -11,8 +12,10 @@ def extract(extractor, **kwargs):
         content = extract_text(**kwargs)
     return content
 
-def extract_text(url_type, url=None, dynamic_source=None, static_source=None, **kwargs):
-    if url_type is None or url_type in ['html', 'other']:
+def extract_text(url_type, url=None, dynamic_source=None, static_source=None, html_file=None, **kwargs):
+    if html_file is not None:
+        content = extract_ccpa_info(html_file)
+    elif url_type is None or url_type in ['html', 'other']:
         content = extract_html(dynamic_source, url)
     elif url_type == 'pdf':
         content = extract_pdf(static_source)
@@ -40,6 +43,25 @@ def extract_google_docs(source):
     text = ''.join(chunks).strip()
     return text
 
+def extract_ccpa_info(html_file):
+    """
+    Scans a local HTML file for the CCPA keywords listed in `KEYWORDS`.
+
+    Returns one line per category, formatted as `category: kw1,kw2,`, naming
+    every keyword that occurs (case-insensitively, as a substring) in the text.
+    """
+    # Binary mode lets BeautifulSoup detect the document's encoding itself
+    # instead of decoding with the platform's default codec.
+    with open(html_file, 'rb') as fp:
+        text = BeautifulSoup(fp, features="html.parser").get_text().lower()
+    result = ''
+    for category, values in KEYWORDS.items():
+        # `text` is lowercased, so compare lowercased keywords; otherwise
+        # mixed-case keywords could never match.
+        found = [v for v in values if v.lower() in text]
+        result += category + ': ' + ''.join(v + ',' for v in found) + '\n'
+    return result
+
 def extract_other(source):
     """
     From https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python

diff --git a/polipy/polipy.py b/polipy/polipy.py
@@ -3,6 +3,7 @@
 from .constants import UTC_DATE, CWD
 from .exceptions import NetworkIOException, ParserException
 from .logger import get_logger
+from bs4 import BeautifulSoup
 
 import os
 import json
@@ -27,7 +28,7 @@ class Policy:
 
     url, source, content = {}, {}, {}
 
-    def __init__(self, url):
+    def __init__(self, url, html_file=None):
         """
         Constructor method. Populates the `Policy.url` attribute.
 
@@ -39,6 +40,7 @@ def __init__(self, url):
         self.url['url'] = url
         self.url = self.url | parse_url(url)
         self.url['domain'] = self.url['domain'].strip().strip('.').strip('/')
+        self.html_file = html_file
 
         # Generate the hash to avoid collisions in output file names.
         self.url['hash'] = hashlib.md5(url.encode()).hexdigest()[:10]
@@ -107,6 +109,7 @@ def extract(self, extractors=['text']):
             'url_type': self.url['type'],
             'static_source': self.source['static_html'],
             'dynamic_source': self.source['dynamic_html'],
+            'html_file': self.html_file
         }
         for extractor in extractors:
             content = extract(extractor, **vargs)
@@ -186,7 +189,7 @@ def __repr__(self):
         return '{}({})'.format(self.__class__, self.to_dict())
 
 # Public module methods.
-def get_policy(url, screenshot=False, timeout=30, extractors=['text'], **kwargs):
+def get_policy(url, html_file=None, screenshot=False, timeout=30, extractors=['text'], **kwargs):
     """
     Helper method that returns a `polipy.Policy` object containing
     information about the policy, scraped and processed from the given URL.
@@ -195,6 +198,8 @@ def get_policy(url, screenshot=False, timeout=30, extractors=['text'], **kwargs)
     ----------
     url : str
         The URL of the privacy policy.
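+    html_file : str, optional
+        Path to a local HTML file; when given, it is scanned for CCPA keywords instead of extracting text from the scraped page (default is `None`).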
     screenshot : bool, optional
         Flag that indicates whether to capture and save the screenshot of the privacy policy page (default is `False`).
     timeout : int, optional
@@ -218,7 +223,7 @@ def get_policy(url, screenshot=False, timeout=30, extractors=['text'], **kwargs)
     ParserException
         Raised if an error occured while extracting text from page source.
     """
-    policy = Policy(url)
+    policy = Policy(url, html_file)
     policy.scrape(screenshot=screenshot, timeout=timeout)
     policy.extract(extractors=extractors)
     return policy