blues-lab · objorkman · Apr 19, 2022 · Apr 27, 2022 · nsamarin · Apr 20, 2022
diff --git a/polipy/constants.py b/polipy/constants.py
@@ -2,3 +2,45 @@
 
 UTC_DATE = datetime.datetime.utcnow().strftime('%Y%m%d')
 CWD = os.getcwd()
+
+# Search terms for each category of personal information, consumed by
+# extract_ccpa_info() in extractors.py.  That function lower-cases the page
+# text before doing substring checks, so every term here must be lower-case
+# or it can never match.
+KEYWORDS = {
+    'identifiers': [
+        'real name', 'alias', 'postal address', 'address',
+        'unique personal identifier', 'online identifier', 'ip address',
+        'email address', 'email', 'account name', 'social security number',
+        'driver license number', 'passport number',
+    ],
+    'customer records information': [
+        'name', 'signature', 'social security number', 'ssn',
+        'physical characteristics', 'address', 'telephone number',
+        'phone number', 'passport number', 'drivers license',
+        'state identification card number', 'insurance policy number',
+        'education', 'employment', 'employment history', 'bank account number',
+        'credit card number', 'debit card number', 'financial information',
+        'medical information', 'health insurance information',
+    ],
+    'characteristics of protected classifications': [
+        'race', 'ancestry', 'national origin', 'religion', 'age',
+        'mental and physical disability', 'sex', 'sexual orientation',
+        'gender identity', 'medical condition', 'genetic information',
+        'marital status', 'military status',
+    ],
+    'commercial information': [
+        'personal property', 'products purchased', 'services purchased',
+        'purchasing histories', 'consuming histories',
+    ],
+    'internet or other electronic network activity information': [
+        'browsing history', 'search history',
+        'interaction with a website, application, or advertisement',
+    ],
+    'geolocation data': ['geolocation data', 'location information', 'gps'],
+    'sensory data': ['audio', 'electronic', 'visual', 'thermal', 'olfactory'],
+    'professional or employment-related information': [
+        'employment information', 'professional information',
+    ],
+    'education information': [
+        'family educational rights and privacy act', 'education', 'school',
+    ],
+    'inferences': [
+        'psychological trends', 'predispositions', 'behavior', 'attitudes',
+        'intelligence', 'aptitude',
+    ],
+}
diff --git a/polipy/extractors.py b/polipy/extractors.py
@@ -1,14 +1,18 @@
 from bs4 import BeautifulSoup
 from io import BytesIO, StringIO
 from pdfminer.high_level import extract_text as parse_pdf
+from .constants import KEYWORDS
 
+# Names of the extractors that `extract` dispatches on.
 extractors = [
-    'text'
+    'text',
+    'keywords'
 ]
 
 def extract(extractor, **kwargs):
+    """
+    Runs the named extractor and returns its output.
+
+    Parameters
+    ----------
+    extractor : str
+        Either `'text'` or `'keywords'`.
+    **kwargs
+        Forwarded to `extract_text` for `'text'`; for `'keywords'` only
+        `html_file` is read.
+
+    Returns
+    -------
+    The extractor's output, or `None` when `extractor` is not recognised or
+    `'keywords'` is requested without an `html_file`.
+    """
+    # Default result, so unhandled cases return None rather than raising
+    # UnboundLocalError at the `return` below.
+    content = None
     if extractor == 'text':
         content = extract_text(**kwargs)
+    elif extractor == 'keywords' and kwargs.get('html_file') is not None:
+        content = extract_ccpa_info(kwargs['html_file'])
     return content
 
 def extract_text(url_type, url=None, dynamic_source=None, static_source=None, **kwargs):
@@ -40,6 +44,22 @@ def extract_google_docs(source):
     text = ''.join(chunks).strip()
     return text
 
+def extract_ccpa_info(html_file):
+    """
+    Finds which `KEYWORDS` terms occur in the text of a local HTML file.
+
+    Parameters
+    ----------
+    html_file : str
+        Path to the HTML file to scan.
+
+    Returns
+    -------
+    list of list of str
+        Rows ready for `csv.writer`: the header row `['CATEGORY']`, then one
+        row per category holding the category name followed by every term
+        found, or by `'N/A'` when no term of that category was found.
+    """
+    with open(html_file) as fp:
+        soup = BeautifulSoup(fp, features="html.parser")
+    text = soup.get_text().lower()
+
+    result = [['CATEGORY']]
+    for category, values in KEYWORDS.items():
+        # Compare lower-cased terms, since `text` is lower-cased.
+        found = [v for v in values if v.lower() in text]
+        # 'N/A' goes in its own cell; a bare string row would be split into
+        # one character per column by `csv.writer`.
+        result.append([category] + (found or ['N/A']))
+    return result
+
 def extract_other(source):
     """
     From https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python

diff --git a/polipy/polipy.py b/polipy/polipy.py
@@ -9,6 +9,7 @@
 import pathlib
 import hashlib
 import logging
+import csv
 
 # Public module class.
 class Policy:
@@ -27,7 +28,7 @@ class Policy:
 
     url, source, content = {}, {}, {}
 
-    def __init__(self, url):
+    def __init__(self, url, html_file=None):
         """
         Constructor method. Populates the `Policy.url` attribute.
 
@@ -39,6 +40,7 @@ def __init__(self, url):
         self.url['url'] = url
         self.url = self.url | parse_url(url)
         self.url['domain'] = self.url['domain'].strip().strip('.').strip('/')
+        self.html_file = html_file
 
         # Generate the hash to avoid collisions in output file names.
         self.url['hash'] = hashlib.md5(url.encode()).hexdigest()[:10]
@@ -107,6 +109,7 @@ def extract(self, extractors=['text']):
             'url_type': self.url['type'],
             'static_source': self.source['static_html'],
             'dynamic_source': self.source['dynamic_html'],
+            'html_file': self.html_file
         }
         for extractor in extractors:
             content = extract(extractor, **vargs)
@@ -142,6 +145,7 @@ def save(self, output_dir, **kwargs):
             'pdf': os.path.join(policy_output_dir, '{}.{}'.format(UTC_DATE, 'pdf')),
             'txt': os.path.join(policy_output_dir, '{}.{}'.format(UTC_DATE, 'txt')),
             'meta': os.path.join(policy_output_dir, '{}.{}'.format(UTC_DATE, 'meta')),
+            'csv': os.path.join(policy_output_dir, '{}.{}'.format(UTC_DATE, 'csv')),
         }
 
         meta = {'last_scraped': UTC_DATE} | self.url
@@ -165,6 +169,11 @@ def save(self, output_dir, **kwargs):
         if len(self.url) > 0:
             with open(output['meta'], 'w') as f:
                 json.dump(meta, f)
+        if self.html_file is not None:
+            with open(output['csv'], 'w') as f:
+                writer = csv.writer(f)
+                for row in self.content['keywords']:
+                    writer.writerow(row)
 
     def to_dict(self):
         """
@@ -186,7 +195,7 @@ def __repr__(self):
         return '{}({})'.format(self.__class__, self.to_dict())
 
 # Public module methods.
-def get_policy(url, screenshot=False, timeout=30, extractors=['text'], **kwargs):
+def get_policy(url, html_file=None, screenshot=False, timeout=30, extractors=['text'], **kwargs):
     """
     Helper method that returns a `polipy.Policy` object containing
     information about the policy, scraped and processed from the given URL.
@@ -195,6 +204,8 @@ def get_policy(url, screenshot=False, timeout=30, extractors=['text'], **kwargs)
     ----------
     url : str
         The URL of the privacy policy.
+    html_file : str, optional
+        Path to a local HTML file to scan with the `'keywords'` extractor (default is `None`).
     screenshot : bool, optional
         Flag that indicates whether to capture and save the screenshot of the privacy policy page (default is `False`).
     timeout : int, optional
@@ -218,7 +229,7 @@ def get_policy(url, screenshot=False, timeout=30, extractors=['text'], **kwargs)
     ParserException
         Raised if an error occured while extracting text from page source.
     """
-    policy = Policy(url)
+    policy = Policy(url, html_file)
     policy.scrape(screenshot=screenshot, timeout=timeout)
     policy.extract(extractors=extractors)
     return policy